github.com/fraugster/parquet-go@v0.12.0/data_store.go

     1  package goparquet
     2  
     3  import (
     4  	"errors"
     5  	"fmt"
     6  	"math/bits"
     7  
     8  	"github.com/fraugster/parquet-go/parquet"
     9  )
    10  
    11  // ColumnStore is the read/write implementation for a column. It buffers a single
    12  // column's data that is to be written to a parquet file, knows how to encode that
    13  // data, and chooses an encoding according to heuristics. It also ensures the
    14  // correct decoding of column data that is read back.
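        //
        // A minimal usage sketch (helper names outside this file, such as NewDataColumn and
        // the FileWriter fw, are assumptions; adjust names and parameters to your schema):
        //
        //	store, _ := NewInt64Store(parquet.Encoding_PLAIN, true, &ColumnParameters{})
        //	col := NewDataColumn(store, parquet.FieldRepetitionType_REQUIRED)
        //	_ = fw.AddColumn("id", col)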
    15  type ColumnStore struct {
    16  	typedColumnStore
    17  
    18  	repTyp parquet.FieldRepetitionType
    19  
    20  	pages   []pageReader
    21  	pageIdx int
    22  
    23  	values *dictStore
    24  
    25  	dLevels *packedArray
    26  	rLevels *packedArray
    27  
    28  	enc     parquet.Encoding
    29  	readPos int
    30  
    31  	useDict bool
    32  
    33  	skipped bool
    34  
    35  	dataPages []*dataPage
    36  
    37  	maxPageSize int64
    38  
    39  	prevNumRecords int64 // this is just for correctly calculating how many rows are in a data page.
    40  
    41  	alloc *allocTracker
    42  }
    43  
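        // dataPage holds the buffered values, repetition/definition levels and page-level
        // statistics of a single data page until it is written out.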
    44  type dataPage struct {
    45  	values     []interface{}
    46  	indexList  []int32
    47  	rL         *packedArray
    48  	dL         *packedArray
    49  	numValues  int64
    50  	nullValues int64
    51  	numRows    int64
    52  	stats      *parquet.Statistics
    53  }
    54  
    55  // useDictionary reports whether dictionary encoding should be used for this column.
    56  func (cs *ColumnStore) useDictionary() bool {
    57  	return cs.useDict
    58  }
    59  
    60  func (cs *ColumnStore) encoding() parquet.Encoding {
    61  	return cs.enc
    62  }
    63  
    64  func (cs *ColumnStore) repetitionType() parquet.FieldRepetitionType {
    65  	return cs.repTyp
    66  }
    67  
    68  func (cs *ColumnStore) reset(rep parquet.FieldRepetitionType, maxR, maxD uint16) {
    69  	if cs.typedColumnStore == nil {
    70  		panic("generic should be used with typed column store")
    71  	}
    72  	cs.repTyp = rep
    73  	if cs.values == nil {
    74  		cs.values = &dictStore{useDict: cs.useDict, alloc: cs.alloc}
    75  		cs.rLevels = &packedArray{}
    76  		cs.dLevels = &packedArray{}
    77  	}
    78  	cs.values.init()
    79  	cs.rLevels.reset(bits.Len16(maxR))
    80  	cs.dLevels.reset(bits.Len16(maxD))
    81  	cs.readPos = 0
    82  	cs.skipped = false
    83  	cs.prevNumRecords = 0
    84  
    85  	cs.typedColumnStore.reset(rep)
    86  }
    87  
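        // appendRDLevel records the repetition and definition levels of a single value.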
    88  func (cs *ColumnStore) appendRDLevel(rl, dl uint16) {
    89  	cs.rLevels.appendSingle(int32(rl))
    90  	cs.dLevels.appendSingle(int32(dl))
    91  }
    92  
    93  // add adds one row to the column. A nil value only records the repetition and definition
    94  // levels; a repeated value (slice) adds every element of the slice. The second argument is
    95  // the definition level, the last two are the maximum and the current repetition level.
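        //
        // For example (a sketch, not from the package's tests): for a top-level REPEATED int64
        // column (maxR=1, maxD=1), adding the row [7, 8, 9] records the level pairs
        // (rL=0, dL=1), (rL=1, dL=1), (rL=1, dL=1), while adding a nil or empty row records a
        // single null value with (rL=0, dL=0).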
    96  func (cs *ColumnStore) add(v interface{}, dL uint16, maxRL, rL uint16) error {
    97  	// if the current column is repeated, we should increase the maxRL here
    98  	if cs.repTyp == parquet.FieldRepetitionType_REPEATED {
    99  		maxRL++
   100  	}
   101  	if rL > maxRL {
   102  		rL = maxRL
   103  	}
   104  	// The definition level is a little tricky: a REQUIRED field can still be nil here when one
   105  	// of its ancestors is nil. Such a value cannot occur at the first level, but it is valid at
   106  	// deeper levels, where the definition level is one less.
   107  	if v == nil {
   108  		cs.appendRDLevel(rL, dL)
   109  		cs.values.addValue(nil, 0)
   110  		return nil
   111  	}
   112  	vals, err := cs.getValues(v)
   113  	if err != nil {
   114  		return err
   115  	}
   116  	if len(vals) == 0 {
   117  		// maxRL may already have been increased above and will be increased again in the recursive call, but for nil values that does not matter
   118  		return cs.add(nil, dL, maxRL, rL)
   119  	}
   120  
   121  	for i, j := range vals {
   122  		cs.values.addValue(j, cs.sizeOf(j))
   123  		tmp := dL
   124  		if cs.repTyp != parquet.FieldRepetitionType_REQUIRED {
   125  			tmp++
   126  		}
   127  
   128  		if i == 0 {
   129  			cs.appendRDLevel(rL, tmp)
   130  		} else {
   131  			cs.appendRDLevel(maxRL, tmp)
   132  		}
   133  	}
   134  
   135  	return nil
   136  }
   137  
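        // estimateSize returns the estimated size of the buffered data: the dictionary size when
        // dictionary encoding is enabled, the plain size otherwise, plus the size of the buffered
        // repetition and definition levels.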
   138  func (cs *ColumnStore) estimateSize() (total int64) {
   139  	dictSize, noDictSize := cs.values.sizes()
   140  	if cs.useDictionary() {
   141  		total += dictSize
   142  	} else {
   143  		total += noDictSize
   144  	}
   145  	total += int64(len(cs.rLevels.data) + len(cs.dLevels.data))
   146  	return total
   147  }
   148  
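        // getMaxPageSize returns the configured maximum page size, defaulting to 1 MiB when unset.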
   149  func (cs *ColumnStore) getMaxPageSize() int64 {
   150  	if cs.maxPageSize == 0 {
   151  		return 1024 * 1024
   152  	}
   153  	return cs.maxPageSize
   154  }
   155  
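        // flushPage cuts the currently buffered values, levels and statistics into a new dataPage.
        // Unless force is true, it does nothing while the estimated size is still below the maximum
        // page size.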
   156  func (cs *ColumnStore) flushPage(sch *schema, force bool) error {
   157  	size := cs.estimateSize()
   158  
   159  	if !force && size < cs.getMaxPageSize() {
   160  		return nil
   161  	}
   162  
   163  	numRows := sch.numRecords - cs.prevNumRecords
   164  	cs.prevNumRecords = sch.numRecords
   165  
   166  	cs.dataPages = append(cs.dataPages, &dataPage{
   167  		values:     cs.values.getValues(),
   168  		rL:         cs.rLevels,
   169  		dL:         cs.dLevels,
   170  		numValues:  int64(cs.values.numValues()),
   171  		nullValues: int64(cs.values.nullValueCount()),
   172  		numRows:    numRows,
   173  		stats: &parquet.Statistics{
   174  			NullCount:     int64Ptr(int64(cs.values.nullValueCount())),
   175  			DistinctCount: int64Ptr(cs.values.distinctValueCount()),
   176  			MaxValue:      cs.getPageStats().maxValue(),
   177  			MinValue:      cs.getPageStats().minValue(),
   178  		},
   179  	})
   180  
   181  	cs.resetData()
   182  
   183  	return nil
   184  }
   185  
   186  func int64Ptr(v int64) *int64 {
   187  	return &v
   188  }
   189  
   190  // getRDLevelAt returns the repetition and definition levels at the given position; if no
   191  // value is left, the last return value is true. A negative position means the current read
   192  // position. NOTE: always keep r before d, in every function.
   193  func (cs *ColumnStore) getRDLevelAt(pos int) (int32, int32, bool) {
   194  	if pos < 0 {
   195  		pos = cs.readPos
   196  	}
   197  	if pos >= cs.rLevels.count || pos >= cs.dLevels.count {
   198  		return 0, 0, true
   199  	}
   200  	dl, err := cs.dLevels.at(pos)
   201  	if err != nil {
   202  		return 0, 0, true
   203  	}
   204  	rl, err := cs.rLevels.at(pos)
   205  	if err != nil {
   206  		return 0, 0, true
   207  	}
   208  
   209  	return rl, dl, false
   210  }
   211  
   212  func (cs *ColumnStore) getNext() (v interface{}, err error) {
   213  	v, err = cs.values.getNextValue()
   214  	if err != nil {
   215  		return nil, err
   216  	}
   217  	return v, nil
   218  }
   219  
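        // resetData drops the buffered values and levels, preserving the level bit widths, and
        // resets the per-page statistics.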
   220  func (cs *ColumnStore) resetData() {
   221  	cs.readPos = 0
   222  	cs.values = &dictStore{useDict: cs.useDict, alloc: cs.alloc}
   223  	cs.values.init()
   224  
   225  	rLevelBitWidth := cs.rLevels.bw
   226  	dLevelBitWidth := cs.dLevels.bw
   227  
   228  	cs.rLevels = &packedArray{}
   229  	cs.dLevels = &packedArray{}
   230  	cs.rLevels.reset(rLevelBitWidth)
   231  	cs.dLevels.reset(dLevelBitWidth)
   232  
   233  	cs.getPageStats().reset()
   234  }
   235  
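        // readNextPage reads all values and repetition/definition levels of the next page into the
        // store, replacing any previously buffered data.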
   236  func (cs *ColumnStore) readNextPage() error {
   237  	if cs.pageIdx >= len(cs.pages) {
   238  		return fmt.Errorf("out of range: requested page index = %d total number of pages = %d", cs.pageIdx, len(cs.pages))
   239  	}
   240  
   241  	data, dl, rl, err := cs.pages[cs.pageIdx].readValues(int(cs.pages[cs.pageIdx].numValues()))
   242  	if err != nil {
   243  		return err
   244  	}
   245  
   246  	cs.pageIdx++
   247  
   248  	cs.resetData()
   249  
   250  	cs.values.readPos = 0
   251  
   252  	for _, v := range data {
   253  		cs.values.addValue(v, cs.sizeOf(v))
   254  	}
   255  
   256  	cs.rLevels.appendArray(rl)
   257  	cs.dLevels.appendArray(dl)
   258  
   259  	return nil
   260  }
   261  
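        // get returns the next value of the column together with its definition level. For REPEATED
        // columns it assembles all values belonging to the current record into a slice before
        // returning it.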
   262  func (cs *ColumnStore) get(maxD, maxR int32) (interface{}, int32, error) {
   263  	if cs.skipped {
   264  		return nil, 0, nil
   265  	}
   266  
   267  	if cs.readPos >= cs.rLevels.count || cs.readPos >= cs.dLevels.count {
   268  		if err := cs.readNextPage(); err != nil {
   269  			return nil, 0, err
   270  		}
   271  	}
   272  	_, dl, _ := cs.getRDLevelAt(cs.readPos)
   273  	// this is a null value: increase the read position to advance the rLevel and dLevel,
   274  	// but do not touch the dict-store
   275  	if dl < maxD {
   276  		cs.readPos++
   277  		return nil, dl, nil
   278  	}
   279  	v, err := cs.getNext()
   280  	if err != nil {
   281  		return nil, 0, err
   282  	}
   283  
   284  	// if this column is not repeated, just return the value; the result is not an array
   285  	if cs.repTyp != parquet.FieldRepetitionType_REPEATED {
   286  		cs.readPos++
   287  		return v, maxD, err
   288  	}
   289  
   290  	// For repeated values, the first rLevel of the current object is always less than maxR,
   291  	// and every following value of the same object has rLevel maxR. An rLevel less than maxR
   292  	// therefore means the value belongs to the next object and must not be consumed here.
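        	//
        	// For example (a sketch): with maxR=1 and buffered rLevels [0, 1, 1, 0], the first call
        	// assembles the first three values into one slice, and the next call returns a slice
        	// containing only the fourth value.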
   293  
   294  	var ret = cs.typedColumnStore.append(nil, v)
   295  	for {
   296  		cs.readPos++
   297  		rl, _, last := cs.getRDLevelAt(cs.readPos)
   298  		if last || rl < maxR {
   299  			// end of this object
   300  			return ret, maxD, nil
   301  		}
   302  		v, err := cs.getNext()
   303  		if err != nil {
   304  			return nil, maxD, err
   305  		}
   306  
   307  		ret = cs.typedColumnStore.append(ret, v)
   308  	}
   309  }
   310  
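        // newStore wraps a typedColumnStore into a generic ColumnStore with the given encoding,
        // dictionary preference and allocation tracker.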
   311  func newStore(typed typedColumnStore, enc parquet.Encoding, useDict bool, alloc *allocTracker) *ColumnStore {
   312  	return &ColumnStore{
   313  		enc:              enc,
   314  		useDict:          useDict,
   315  		typedColumnStore: typed,
   316  		alloc:            alloc,
   317  	}
   318  }
   319  
   320  func newPlainStore(typed typedColumnStore, alloc *allocTracker) *ColumnStore {
   321  	return newStore(typed, parquet.Encoding_PLAIN, true, alloc)
   322  }
   323  
   324  // getValuesStore is used internally by the reader to build a column store for the given schema element.
   325  func getValuesStore(typ *parquet.SchemaElement, alloc *allocTracker) (*ColumnStore, error) {
   326  	params := &ColumnParameters{
   327  		LogicalType:   typ.LogicalType,
   328  		ConvertedType: typ.ConvertedType,
   329  		TypeLength:    typ.TypeLength,
   330  		Scale:         typ.Scale,
   331  		Precision:     typ.Precision,
   332  	}
   333  
   334  	switch *typ.Type {
   335  	case parquet.Type_BOOLEAN:
   336  		return newPlainStore(&booleanStore{ColumnParameters: params}, alloc), nil
   337  	case parquet.Type_BYTE_ARRAY:
   338  		return newPlainStore(&byteArrayStore{ColumnParameters: params}, alloc), nil
   339  	case parquet.Type_FIXED_LEN_BYTE_ARRAY:
   340  		if typ.TypeLength == nil {
   341  			return nil, fmt.Errorf("type %s with nil type length", typ.Type)
   342  		}
   343  
   344  		return newPlainStore(&byteArrayStore{ColumnParameters: params}, alloc), nil
   345  
   346  	case parquet.Type_FLOAT:
   347  		return newPlainStore(&floatStore{ColumnParameters: params, stats: newFloatStats(), pageStats: newFloatStats()}, alloc), nil
   348  	case parquet.Type_DOUBLE:
   349  		return newPlainStore(&doubleStore{ColumnParameters: params, stats: newDoubleStats(), pageStats: newDoubleStats()}, alloc), nil
   350  
   351  	case parquet.Type_INT32:
   352  		return newPlainStore(&int32Store{ColumnParameters: params, stats: newInt32Stats(), pageStats: newInt32Stats()}, alloc), nil
   353  	case parquet.Type_INT64:
   354  		return newPlainStore(&int64Store{ColumnParameters: params, stats: newInt64Stats(), pageStats: newInt64Stats()}, alloc), nil
   355  	case parquet.Type_INT96:
   356  		store := &int96Store{}
   357  		store.ColumnParameters = params
   358  		return newPlainStore(store, alloc), nil
   359  	default:
   360  		return nil, fmt.Errorf("unsupported type: %s", typ.Type)
   361  	}
   362  }
   363  
   364  // NewBooleanStore creates a new column store to store boolean values.
   365  func NewBooleanStore(enc parquet.Encoding, params *ColumnParameters) (*ColumnStore, error) {
   366  	switch enc {
   367  	case parquet.Encoding_PLAIN, parquet.Encoding_RLE:
   368  	default:
   369  		return nil, fmt.Errorf("encoding %q is not supported on this type", enc)
   370  	}
   371  	return newStore(&booleanStore{ColumnParameters: params}, enc, false, nil), nil // allocTracker is set by recursiveFix
   372  }
   373  
   374  // NewInt32Store creates a new column store to store int32 values. If useDict is true,
   375  // then a dictionary is used, otherwise a dictionary will never be used to encode the data.
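        //
        // A minimal sketch of creating such a store (encoding and parameters are examples,
        // adjust them to your schema):
        //
        //	store, err := NewInt32Store(parquet.Encoding_PLAIN, true, &ColumnParameters{})
        //	if err != nil {
        //		// the chosen encoding is not supported for int32 columns
        //	}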
   376  func NewInt32Store(enc parquet.Encoding, useDict bool, params *ColumnParameters) (*ColumnStore, error) {
   377  	switch enc {
   378  	case parquet.Encoding_PLAIN, parquet.Encoding_DELTA_BINARY_PACKED:
   379  	default:
   380  		return nil, fmt.Errorf("encoding %q is not supported on this type", enc)
   381  	}
   382  	return newStore(&int32Store{ColumnParameters: params, stats: newInt32Stats(), pageStats: newInt32Stats()}, enc, useDict, nil), nil // allocTracker is set by recursiveFix
   383  }
   384  
   385  // NewInt64Store creates a new column store to store int64 values. If useDict is true,
   386  // then a dictionary is used, otherwise a dictionary will never be used to encode the data.
   387  func NewInt64Store(enc parquet.Encoding, useDict bool, params *ColumnParameters) (*ColumnStore, error) {
   388  	switch enc {
   389  	case parquet.Encoding_PLAIN, parquet.Encoding_DELTA_BINARY_PACKED:
   390  	default:
   391  		return nil, fmt.Errorf("encoding %q is not supported on this type", enc)
   392  	}
   393  	return newStore(&int64Store{ColumnParameters: params, stats: newInt64Stats(), pageStats: newInt64Stats()}, enc, useDict, nil), nil // allocTracker is set by recursiveFix
   394  }
   395  
   396  // NewInt96Store creates a new column store to store int96 values. If useDict is true,
   397  // then a dictionary is used, otherwise a dictionary will never be used to encode the data.
   398  func NewInt96Store(enc parquet.Encoding, useDict bool, params *ColumnParameters) (*ColumnStore, error) {
   399  	switch enc {
   400  	case parquet.Encoding_PLAIN:
   401  	default:
   402  		return nil, fmt.Errorf("encoding %q is not supported on this type", enc)
   403  	}
   404  	store := &int96Store{}
   405  	store.ColumnParameters = params
   406  	return newStore(store, enc, useDict, nil), nil // allocTracker is set by recursiveFix
   407  }
   408  
   409  // NewFloatStore creates a new column store to store float (float32) values. If useDict is true,
   410  // then a dictionary is used, otherwise a dictionary will never be used to encode the data.
   411  func NewFloatStore(enc parquet.Encoding, useDict bool, params *ColumnParameters) (*ColumnStore, error) {
   412  	switch enc {
   413  	case parquet.Encoding_PLAIN:
   414  	default:
   415  		return nil, fmt.Errorf("encoding %q is not supported on this type", enc)
   416  	}
   417  	return newStore(&floatStore{ColumnParameters: params, stats: newFloatStats(), pageStats: newFloatStats()}, enc, useDict, nil), nil // allocTracker is set by recursiveFix
   418  }
   419  
   420  // NewDoubleStore creates a new column store to store double (float64) values. If useDict is true,
   421  // then a dictionary is used, otherwise a dictionary will never be used to encode the data.
   422  func NewDoubleStore(enc parquet.Encoding, useDict bool, params *ColumnParameters) (*ColumnStore, error) {
   423  	switch enc {
   424  	case parquet.Encoding_PLAIN:
   425  	default:
   426  		return nil, fmt.Errorf("encoding %q is not supported on this type", enc)
   427  	}
   428  	return newStore(&doubleStore{ColumnParameters: params, stats: newDoubleStats(), pageStats: newDoubleStats()}, enc, useDict, nil), nil // allocTracker is set by recursiveFix
   429  }
   430  
   431  // NewByteArrayStore creates a new column store to store byte arrays. If useDict is true,
   432  // then a dictionary is used, otherwise a dictionary will never be used to encode the data.
   433  func NewByteArrayStore(enc parquet.Encoding, useDict bool, params *ColumnParameters) (*ColumnStore, error) {
   434  	switch enc {
   435  	case parquet.Encoding_PLAIN, parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY, parquet.Encoding_DELTA_BYTE_ARRAY:
   436  	default:
   437  		return nil, fmt.Errorf("encoding %q is not supported on this type", enc)
   438  	}
   439  	return newStore(&byteArrayStore{ColumnParameters: params}, enc, useDict, nil), nil // allocTracker is set by recursiveFix
   440  }
   441  
   442  // NewFixedByteArrayStore creates a new column store to store fixed size byte arrays. If useDict is true,
   443  // then a dictionary is used, otherwise a dictionary will never be used to encode the data.
   444  func NewFixedByteArrayStore(enc parquet.Encoding, useDict bool, params *ColumnParameters) (*ColumnStore, error) {
   445  	switch enc {
   446  	case parquet.Encoding_PLAIN, parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY, parquet.Encoding_DELTA_BYTE_ARRAY:
   447  	default:
   448  		return nil, fmt.Errorf("encoding %q is not supported on this type", enc)
   449  	}
   450  	if params.TypeLength == nil {
   451  		return nil, errors.New("no length provided")
   452  	}
   453  
   454  	if *params.TypeLength <= 0 {
   455  		return nil, fmt.Errorf("fix length with len %d is not possible", *params.TypeLength)
   456  	}
   457  
   458  	return newStore(&byteArrayStore{
   459  		ColumnParameters: params,
   460  	}, enc, useDict, nil), nil // allocTracker is set by recursiveFix
   461  }