github.com/fraugster/parquet-go@v0.12.0/chunk_writer.go (about)

     1  package goparquet
     2  
     3  import (
     4  	"bytes"
     5  	"context"
     6  	"fmt"
     7  	"math"
     8  	"sort"
     9  
    10  	"github.com/fraugster/parquet-go/parquet"
    11  )
    12  
    13  func getBooleanValuesEncoder(pageEncoding parquet.Encoding) (valuesEncoder, error) {
    14  	switch pageEncoding {
    15  	case parquet.Encoding_PLAIN:
    16  		return &booleanPlainEncoder{}, nil
    17  	case parquet.Encoding_RLE:
    18  		return &booleanRLEEncoder{}, nil
    19  	default:
    20  		return nil, fmt.Errorf("unsupported encoding %s for boolean", pageEncoding)
    21  	}
    22  }
    23  
    24  func getByteArrayValuesEncoder(pageEncoding parquet.Encoding, dictValues []interface{}) (valuesEncoder, error) {
    25  	switch pageEncoding {
    26  	case parquet.Encoding_PLAIN:
    27  		return &byteArrayPlainEncoder{}, nil
    28  	case parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY:
    29  		return &byteArrayDeltaLengthEncoder{}, nil
    30  	case parquet.Encoding_DELTA_BYTE_ARRAY:
    31  		return &byteArrayDeltaEncoder{}, nil
    32  	default:
    33  		return nil, fmt.Errorf("unsupported encoding %s for binary", pageEncoding)
    34  	}
    35  }
    36  
    37  func getFixedLenByteArrayValuesEncoder(pageEncoding parquet.Encoding, len int, dictValues []interface{}) (valuesEncoder, error) {
    38  	switch pageEncoding {
    39  	case parquet.Encoding_PLAIN:
    40  		return &byteArrayPlainEncoder{length: len}, nil
    41  	case parquet.Encoding_DELTA_BYTE_ARRAY:
    42  		return &byteArrayDeltaEncoder{}, nil
    43  	default:
    44  		return nil, fmt.Errorf("unsupported encoding %s for fixed_len_byte_array(%d)", pageEncoding, len)
    45  	}
    46  }
    47  
    48  func getInt32ValuesEncoder(pageEncoding parquet.Encoding, typ *parquet.SchemaElement, dictValues []interface{}) (valuesEncoder, error) {
    49  	switch pageEncoding {
    50  	case parquet.Encoding_PLAIN:
    51  		return &int32PlainEncoder{}, nil
    52  	case parquet.Encoding_DELTA_BINARY_PACKED:
    53  		return &int32DeltaBPEncoder{
    54  			deltaBitPackEncoder32: deltaBitPackEncoder32{
    55  				blockSize:      128,
    56  				miniBlockCount: 4,
    57  			},
    58  		}, nil
    59  	default:
    60  		return nil, fmt.Errorf("unsupported encoding %s for int32", pageEncoding)
    61  	}
    62  }
    63  
    64  func getInt64ValuesEncoder(pageEncoding parquet.Encoding, typ *parquet.SchemaElement, dictValues []interface{}) (valuesEncoder, error) {
    65  	switch pageEncoding {
    66  	case parquet.Encoding_PLAIN:
    67  		return &int64PlainEncoder{}, nil
    68  	case parquet.Encoding_DELTA_BINARY_PACKED:
    69  		return &int64DeltaBPEncoder{
    70  			deltaBitPackEncoder64: deltaBitPackEncoder64{
    71  				blockSize:      128,
    72  				miniBlockCount: 4,
    73  			},
    74  		}, nil
    75  	default:
    76  		return nil, fmt.Errorf("unsupported encoding %s for int64", pageEncoding)
    77  	}
    78  }
    79  
    80  func getValuesEncoder(pageEncoding parquet.Encoding, typ *parquet.SchemaElement, dictValues []interface{}) (valuesEncoder, error) {
    81  	// Change the deprecated value
    82  	if pageEncoding == parquet.Encoding_PLAIN_DICTIONARY {
    83  		pageEncoding = parquet.Encoding_RLE_DICTIONARY
    84  	}
    85  
    86  	switch *typ.Type {
    87  	case parquet.Type_BOOLEAN:
    88  		return getBooleanValuesEncoder(pageEncoding)
    89  
    90  	case parquet.Type_BYTE_ARRAY:
    91  		return getByteArrayValuesEncoder(pageEncoding, dictValues)
    92  
    93  	case parquet.Type_FIXED_LEN_BYTE_ARRAY:
    94  		if typ.TypeLength == nil {
    95  			return nil, fmt.Errorf("type %s with nil type len", typ.Type)
    96  		}
    97  		return getFixedLenByteArrayValuesEncoder(pageEncoding, int(*typ.TypeLength), dictValues)
    98  
    99  	case parquet.Type_FLOAT:
   100  		switch pageEncoding {
   101  		case parquet.Encoding_PLAIN:
   102  			return &floatPlainEncoder{}, nil
   103  		}
   104  
   105  	case parquet.Type_DOUBLE:
   106  		switch pageEncoding {
   107  		case parquet.Encoding_PLAIN:
   108  			return &doublePlainEncoder{}, nil
   109  		}
   110  
   111  	case parquet.Type_INT32:
   112  		return getInt32ValuesEncoder(pageEncoding, typ, dictValues)
   113  
   114  	case parquet.Type_INT64:
   115  		return getInt64ValuesEncoder(pageEncoding, typ, dictValues)
   116  
   117  	case parquet.Type_INT96:
   118  		switch pageEncoding {
   119  		case parquet.Encoding_PLAIN:
   120  			return &int96PlainEncoder{}, nil
   121  		}
   122  
   123  	default:
   124  		return nil, fmt.Errorf("unsupported type: %s", typ.Type)
   125  	}
   126  
   127  	return nil, fmt.Errorf("unsupported encoding %s for %s type", pageEncoding, typ.Type)
   128  }
   129  
   130  func getDictValuesEncoder(typ *parquet.SchemaElement) (valuesEncoder, error) {
   131  	switch *typ.Type {
   132  	case parquet.Type_BYTE_ARRAY:
   133  		return &byteArrayPlainEncoder{}, nil
   134  	case parquet.Type_FIXED_LEN_BYTE_ARRAY:
   135  		if typ.TypeLength == nil {
   136  			return nil, fmt.Errorf("type %s with nil type len", typ)
   137  		}
   138  		return &byteArrayPlainEncoder{length: int(*typ.TypeLength)}, nil
   139  	case parquet.Type_FLOAT:
   140  		return &floatPlainEncoder{}, nil
   141  	case parquet.Type_DOUBLE:
   142  		return &doublePlainEncoder{}, nil
   143  	case parquet.Type_INT32:
   144  		return &int32PlainEncoder{}, nil
   145  	case parquet.Type_INT64:
   146  		return &int64PlainEncoder{}, nil
   147  	case parquet.Type_INT96:
   148  		return &int96PlainEncoder{}, nil
   149  	}
   150  
   151  	return nil, fmt.Errorf("type %s is not supported for dict value encoder", typ)
   152  }
   153  
// writeChunk writes a single column chunk to w: an optional dictionary page
// followed by all buffered data pages of col. It returns the ColumnChunk
// metadata (offsets, sizes, encodings, statistics) to be embedded in the row
// group. kvMetaData is attached to the chunk as key/value metadata, sorted by
// key for deterministic output.
func writeChunk(ctx context.Context, w writePos, sch *schema, col *Column, codec parquet.CompressionCodec, pageFn newDataPageFunc, kvMetaData map[string]string) (*parquet.ColumnChunk, error) {
	pos := w.Pos() // Save the position before writing data
	chunkOffset := pos
	var (
		dictPageOffset *int64
		// NOTE:
		// The parquet documentation for these two fields reads:
		//  - TotalUncompressedSize: total byte size of all uncompressed pages in this column chunk (including the headers)
		//  - TotalCompressedSize: total byte size of all compressed pages in this column chunk (including the headers)
		// The "including the headers" part is the tricky bit: the compressed total can be measured from the writer
		// position, but the uncompressed sizes reported by the page writers do not include headers, so the header
		// size is derived below as (measured compressed total - reported compressed size) and added back.
		totalComp   int64
		totalUnComp int64
	)

	// flush final data page before writing dictionary page (if applicable) and all data pages.
	if err := col.data.flushPage(sch, true); err != nil {
		return nil, err
	}

	dictValues := []interface{}{}
	indices := map[interface{}]int32{}
	useDict := true

	if *col.Type() == parquet.Type_BOOLEAN { // never ever use dictionary encoding on booleans.
		useDict = false
	}
	// Respect the column's own dictionary-encoding setting.
	if !col.data.useDictionary() {
		useDict = false
	}

	if useDict {
		// Build the dictionary and each page's index list in one pass.
		// Dictionary encoding is abandoned as soon as the number of distinct
		// values exceeds math.MaxInt16.
	outerLoop:
		for _, page := range col.data.dataPages {
			if page.stats.DistinctCount != nil && *page.stats.DistinctCount > math.MaxInt16 {
				useDict = false
				break outerLoop
			}
			for _, v := range page.values {
				k := mapKey(v)
				if idx, ok := indices[k]; !ok {
					// First occurrence: assign the next dictionary index.
					idx = int32(len(dictValues))
					indices[k] = idx
					dictValues = append(dictValues, v)
					page.indexList = append(page.indexList, idx)

					if len(dictValues) > math.MaxInt16 {
						useDict = false
						break outerLoop
					}
				} else {
					page.indexList = append(page.indexList, idx)
				}
			}
		}
	}

	if useDict {
		// The dictionary page is written first; its offset is recorded
		// separately from the first data page offset in the chunk metadata.
		tmp := pos // make a copy, do not use the pos here
		dictPageOffset = &tmp
		dict := &dictPageWriter{}
		if err := dict.init(sch, col, codec, dictValues); err != nil {
			return nil, err
		}
		compSize, unCompSize, err := dict.write(ctx, w)
		if err != nil {
			return nil, err
		}
		totalComp = w.Pos() - pos
		// Header size plus the rLevel and dLevel size
		headerSize := totalComp - int64(compSize)
		totalUnComp = int64(unCompSize) + headerSize
		pos = w.Pos() // Move position for data pos
	}

	var (
		compSize, unCompSize  int
		numValues, nullValues int64
	)

	// Write every buffered data page. Each page is rendered into an in-memory
	// buffer first, so a failed page write does not leave a partial page in w.
	for _, page := range col.data.dataPages {
		pw := pageFn(useDict, dictValues, page, sch.enableCRC)

		if err := pw.init(col, codec); err != nil {
			return nil, err
		}

		var buf bytes.Buffer

		compressed, uncompressed, err := pw.write(ctx, &buf)
		if err != nil {
			return nil, err
		}

		compSize += compressed
		unCompSize += uncompressed
		numValues += page.numValues
		nullValues += page.nullValues
		if _, err := w.Write(buf.Bytes()); err != nil {
			return nil, err
		}
	}

	// All pages have been flushed to w; release the buffered pages.
	col.data.dataPages = nil

	totalComp += w.Pos() - pos
	// Header size plus the rLevel and dLevel size
	headerSize := totalComp - int64(compSize)
	totalUnComp += int64(unCompSize) + headerSize

	encodings := make([]parquet.Encoding, 0, 3)
	encodings = append(encodings,
		parquet.Encoding_RLE,
		col.data.encoding(),
	)
	if useDict {
		encodings[1] = parquet.Encoding_PLAIN // In dictionary we use PLAIN for the data, not the column encoding
		encodings = append(encodings, parquet.Encoding_RLE_DICTIONARY)
	}

	// Sort the key/value metadata by key so the output is deterministic
	// regardless of map iteration order.
	keyValueMetaData := make([]*parquet.KeyValue, 0, len(kvMetaData))
	for k, v := range kvMetaData {
		value := v // copy: v is reused across loop iterations
		keyValueMetaData = append(keyValueMetaData, &parquet.KeyValue{Key: k, Value: &value})
	}
	sort.Slice(keyValueMetaData, func(i, j int) bool {
		return keyValueMetaData[i].Key < keyValueMetaData[j].Key
	})

	// NOTE(review): when dictionary encoding is disabled or abandoned part way
	// through, dictValues is empty or partial, so DistinctCount may undercount
	// the true number of distinct values — confirm this is intended.
	distinctCount := int64(len(dictValues))

	stats := &parquet.Statistics{
		MinValue:      col.data.getStats().minValue(),
		MaxValue:      col.data.getStats().maxValue(),
		NullCount:     &nullValues,
		DistinctCount: &distinctCount,
	}

	ch := &parquet.ColumnChunk{
		FilePath:   nil, // No support for external
		FileOffset: chunkOffset,
		MetaData: &parquet.ColumnMetaData{
			Type:                  col.data.parquetType(),
			Encodings:             encodings,
			PathInSchema:          col.path,
			Codec:                 codec,
			NumValues:             numValues + nullValues,
			TotalUncompressedSize: totalUnComp,
			TotalCompressedSize:   totalComp,
			KeyValueMetadata:      keyValueMetaData,
			DataPageOffset:        pos, // offset of the first data page (dictionary page offset is tracked separately)
			IndexPageOffset:       nil,
			DictionaryPageOffset:  dictPageOffset,
			Statistics:            stats,
			EncodingStats:         nil,
		},
		OffsetIndexOffset: nil,
		OffsetIndexLength: nil,
		ColumnIndexOffset: nil,
		ColumnIndexLength: nil,
	}

	return ch, nil
}
   318  
   319  func writeRowGroup(ctx context.Context, w writePos, sch *schema, codec parquet.CompressionCodec, pageFn newDataPageFunc, h *flushRowGroupOptionHandle) ([]*parquet.ColumnChunk, error) {
   320  	dataCols := sch.Columns()
   321  	var res = make([]*parquet.ColumnChunk, 0, len(dataCols))
   322  	for _, ci := range dataCols {
   323  		ch, err := writeChunk(ctx, w, sch, ci, codec, pageFn, h.getMetaData(ci.Path()))
   324  		if err != nil {
   325  			return nil, err
   326  		}
   327  
   328  		res = append(res, ch)
   329  	}
   330  
   331  	return res, nil
   332  }