github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/merge_test.go (about)

     1  package parquet_test
     2  
     3  import (
     4  	"bytes"
     5  	"errors"
     6  	"fmt"
     7  	"io"
     8  	"math/rand"
     9  	"sort"
    10  	"testing"
    11  
    12  	"github.com/vc42/parquet-go"
    13  )
    14  
    15  const (
    16  	numRowGroups = 3
    17  	rowsPerGroup = benchmarkNumRows
    18  )
    19  
    20  func BenchmarkMergeRowGroups(b *testing.B) {
    21  	for _, test := range readerTests {
    22  		b.Run(test.scenario, func(b *testing.B) {
    23  			schema := parquet.SchemaOf(test.model)
    24  
    25  			options := []parquet.RowGroupOption{
    26  				parquet.SortingColumns(
    27  					parquet.Ascending(schema.Columns()[0]...),
    28  				),
    29  			}
    30  
    31  			prng := rand.New(rand.NewSource(0))
    32  			rowGroups := make([]parquet.RowGroup, numRowGroups)
    33  
    34  			for i := range rowGroups {
    35  				rowGroups[i] = sortedRowGroup(options, randomRowsOf(prng, rowsPerGroup, test.model)...)
    36  			}
    37  
    38  			for n := 1; n <= numRowGroups; n++ {
    39  				b.Run(fmt.Sprintf("groups=%d,rows=%d", n, n*rowsPerGroup), func(b *testing.B) {
    40  					mergedRowGroup, err := parquet.MergeRowGroups(rowGroups[:n])
    41  					if err != nil {
    42  						b.Fatal(err)
    43  					}
    44  
    45  					rows := mergedRowGroup.Rows()
    46  					rbuf := make([]parquet.Row, benchmarkRowsPerStep)
    47  					defer func() { rows.Close() }()
    48  
    49  					benchmarkRowsPerSecond(b, func() int {
    50  						n, err := rows.ReadRows(rbuf)
    51  						if err != nil {
    52  							if !errors.Is(err, io.EOF) {
    53  								b.Fatal(err)
    54  							}
    55  							rows.Close()
    56  							rows = mergedRowGroup.Rows()
    57  						}
    58  						return n
    59  					})
    60  				})
    61  			}
    62  		})
    63  	}
    64  }
    65  
    66  func BenchmarkMergeFiles(b *testing.B) {
    67  	rowGroupBuffers := make([]bytes.Buffer, numRowGroups)
    68  
    69  	for _, test := range readerTests {
    70  		b.Run(test.scenario, func(b *testing.B) {
    71  			schema := parquet.SchemaOf(test.model)
    72  
    73  			buffer := parquet.NewBuffer(
    74  				schema,
    75  				parquet.SortingColumns(
    76  					parquet.Ascending(schema.Columns()[0]...),
    77  				),
    78  			)
    79  
    80  			prng := rand.New(rand.NewSource(0))
    81  			files := make([]*parquet.File, numRowGroups)
    82  			rowGroups := make([]parquet.RowGroup, numRowGroups)
    83  
    84  			for i := range files {
    85  				for _, row := range randomRowsOf(prng, rowsPerGroup, test.model) {
    86  					buffer.Write(row)
    87  				}
    88  				sort.Sort(buffer)
    89  				rowGroupBuffers[i].Reset()
    90  				writer := parquet.NewWriter(&rowGroupBuffers[i])
    91  				_, err := copyRowsAndClose(writer, buffer.Rows())
    92  				if err != nil {
    93  					b.Fatal(err)
    94  				}
    95  				if err := writer.Close(); err != nil {
    96  					b.Fatal(err)
    97  				}
    98  				r := bytes.NewReader(rowGroupBuffers[i].Bytes())
    99  				f, err := parquet.OpenFile(r, r.Size())
   100  				if err != nil {
   101  					b.Fatal(err)
   102  				}
   103  				files[i], rowGroups[i] = f, f.RowGroups()[0]
   104  			}
   105  
   106  			for n := 1; n <= numRowGroups; n++ {
   107  				b.Run(fmt.Sprintf("groups=%d,rows=%d", n, n*rowsPerGroup), func(b *testing.B) {
   108  					mergedRowGroup, err := parquet.MergeRowGroups(rowGroups[:n])
   109  					if err != nil {
   110  						b.Fatal(err)
   111  					}
   112  
   113  					rows := mergedRowGroup.Rows()
   114  					rbuf := make([]parquet.Row, benchmarkRowsPerStep)
   115  					defer func() { rows.Close() }()
   116  
   117  					benchmarkRowsPerSecond(b, func() int {
   118  						n, err := rows.ReadRows(rbuf)
   119  						if err != nil {
   120  							if !errors.Is(err, io.EOF) {
   121  								b.Fatal(err)
   122  							}
   123  							rows.Close()
   124  							rows = mergedRowGroup.Rows()
   125  						}
   126  						return n
   127  					})
   128  
   129  					totalSize := int64(0)
   130  					for _, f := range files[:n] {
   131  						totalSize += f.Size()
   132  					}
   133  				})
   134  			}
   135  		})
   136  	}
   137  }