github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/merge_test.go

package parquet_test

import (
	"bytes"
	"errors"
	"fmt"
	"io"
	"math/rand"
	"sort"
	"testing"

	"github.com/vc42/parquet-go"
)

const (
	numRowGroups = 3
	rowsPerGroup = benchmarkNumRows
)

// BenchmarkMergeRowGroups measures the read throughput of a merged view over
// 1..numRowGroups sorted in-memory row groups, for each reader test scenario.
func BenchmarkMergeRowGroups(b *testing.B) {
	for _, test := range readerTests {
		b.Run(test.scenario, func(b *testing.B) {
			schema := parquet.SchemaOf(test.model)

			options := []parquet.RowGroupOption{
				parquet.SortingColumns(
					parquet.Ascending(schema.Columns()[0]...),
				),
			}

			prng := rand.New(rand.NewSource(0))
			rowGroups := make([]parquet.RowGroup, numRowGroups)

			for i := range rowGroups {
				rowGroups[i] = sortedRowGroup(options, randomRowsOf(prng, rowsPerGroup, test.model)...)
			}

			for n := 1; n <= numRowGroups; n++ {
				b.Run(fmt.Sprintf("groups=%d,rows=%d", n, n*rowsPerGroup), func(b *testing.B) {
					mergedRowGroup, err := parquet.MergeRowGroups(rowGroups[:n])
					if err != nil {
						b.Fatal(err)
					}

					rows := mergedRowGroup.Rows()
					rbuf := make([]parquet.Row, benchmarkRowsPerStep)
					defer func() { rows.Close() }()

					benchmarkRowsPerSecond(b, func() int {
						n, err := rows.ReadRows(rbuf)
						if err != nil {
							if !errors.Is(err, io.EOF) {
								b.Fatal(err)
							}
							// Once all rows have been consumed, restart reading
							// from the beginning of the merged row group.
							rows.Close()
							rows = mergedRowGroup.Rows()
						}
						return n
					})
				})
			}
		})
	}
}

// BenchmarkMergeFiles performs the same measurement as BenchmarkMergeRowGroups,
// except that the row groups are first serialized to parquet files held in
// in-memory buffers and read back through parquet.OpenFile.
func BenchmarkMergeFiles(b *testing.B) {
	rowGroupBuffers := make([]bytes.Buffer, numRowGroups)

	for _, test := range readerTests {
		b.Run(test.scenario, func(b *testing.B) {
			schema := parquet.SchemaOf(test.model)

			buffer := parquet.NewBuffer(
				schema,
				parquet.SortingColumns(
					parquet.Ascending(schema.Columns()[0]...),
				),
			)

			prng := rand.New(rand.NewSource(0))
			files := make([]*parquet.File, numRowGroups)
			rowGroups := make([]parquet.RowGroup, numRowGroups)

			for i := range files {
				for _, row := range randomRowsOf(prng, rowsPerGroup, test.model) {
					buffer.Write(row)
				}
				sort.Sort(buffer)
				rowGroupBuffers[i].Reset()
				writer := parquet.NewWriter(&rowGroupBuffers[i])
				_, err := copyRowsAndClose(writer, buffer.Rows())
				if err != nil {
					b.Fatal(err)
				}
				if err := writer.Close(); err != nil {
					b.Fatal(err)
				}
				r := bytes.NewReader(rowGroupBuffers[i].Bytes())
				f, err := parquet.OpenFile(r, r.Size())
				if err != nil {
					b.Fatal(err)
				}
				files[i], rowGroups[i] = f, f.RowGroups()[0]
			}

			for n := 1; n <= numRowGroups; n++ {
				b.Run(fmt.Sprintf("groups=%d,rows=%d", n, n*rowsPerGroup), func(b *testing.B) {
					mergedRowGroup, err := parquet.MergeRowGroups(rowGroups[:n])
					if err != nil {
						b.Fatal(err)
					}

					rows := mergedRowGroup.Rows()
					rbuf := make([]parquet.Row, benchmarkRowsPerStep)
					defer func() { rows.Close() }()

					benchmarkRowsPerSecond(b, func() int {
						n, err := rows.ReadRows(rbuf)
						if err != nil {
							if !errors.Is(err, io.EOF) {
								b.Fatal(err)
							}
							rows.Close()
							rows = mergedRowGroup.Rows()
						}
						return n
					})

					totalSize := int64(0)
					for _, f := range files[:n] {
						totalSize += f.Size()
					}
				})
			}
		})
	}
}