go.charczuk.com@v0.0.0-20240327042549-bc490516bd1a/projects/nodes/pkg/funcs/read_parquet.go (about)

     1  /*
     2  
     3  Copyright (c) 2023 - Present. Will Charczuk. All rights reserved.
     4  Use of this source code is governed by a MIT license that can be found in the LICENSE file at the root of the repository.
     5  
     6  */
     7  
     8  package funcs
     9  
    10  import (
    11  	"context"
    12  	"os"
    13  	"time"
    14  
    15  	parquet "github.com/parquet-go/parquet-go"
    16  
    17  	"go.charczuk.com/projects/nodes/pkg/incrutil"
    18  	"go.charczuk.com/projects/nodes/pkg/types"
    19  	"go.charczuk.com/sdk/iter"
    20  )
    21  
    22  const defaultMaxParquetFileSize = 32 << 20 // 32mb
    23  
    24  func ReadParquet(ctx context.Context, filepath string, modtime time.Time) (output *types.Table, err error) {
    25  	var f *os.File
    26  	f, err = os.Open(filepath)
    27  	if err != nil {
    28  		return
    29  	}
    30  	defer f.Close()
    31  
    32  	var pf *parquet.File
    33  	pf, err = parquet.OpenFile(f, defaultMaxParquetFileSize)
    34  	if err != nil {
    35  		return
    36  	}
    37  
    38  	output = new(types.Table)
    39  
    40  	columns := pf.Root().Columns()
    41  	for _, column := range columns {
    42  		newCol := types.TableColumn{
    43  			Name: column.Name(),
    44  		}
    45  		pages := column.Pages()
    46  		var page parquet.Page
    47  		for {
    48  			page, err = pages.ReadPage()
    49  			if err != nil {
    50  				break
    51  			}
    52  			switch typedPage := page.Values().(type) {
    53  			case parquet.BooleanReader:
    54  				values := make([]bool, page.NumValues())
    55  				typedPage.ReadBooleans(values)
    56  				newCol.ValueType = incrutil.APIValueTypeForGoType(incrutil.ValueTypeBool)
    57  				newCol.Values = append(newCol.Values, iter.Apply(values, func(v bool) any { return v })...)
    58  			case parquet.Int64Reader:
    59  				values := make([]int64, page.NumValues())
    60  				typedPage.ReadInt64s(values)
    61  				newCol.ValueType = incrutil.APIValueTypeForGoType(incrutil.ValueTypeInt64)
    62  				newCol.Values = append(newCol.Values, iter.Apply(values, func(v int64) any { return v })...)
    63  			case parquet.DoubleReader:
    64  				values := make([]float64, page.NumValues())
    65  				typedPage.ReadDoubles(values)
    66  				newCol.ValueType = incrutil.APIValueTypeForGoType(incrutil.ValueTypeFloat64)
    67  				newCol.Values = append(newCol.Values, iter.Apply(values, func(v float64) any { return v })...)
    68  			default:
    69  				values := make([]parquet.Value, page.NumValues())
    70  				typedPage.ReadValues(values)
    71  				newCol.ValueType = incrutil.APIValueTypeForGoType(incrutil.ValueTypeString)
    72  				newCol.Values = append(newCol.Values, iter.Apply(values, func(v parquet.Value) any { return v.GoString() })...)
    73  			}
    74  		}
    75  		output.Columns = append(output.Columns, newCol)
    76  	}
    77  	return
    78  }