github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/sorting_test.go (about) 1 //go:build go1.18 2 3 package parquet_test 4 5 import ( 6 "bytes" 7 "math/rand" 8 "sort" 9 "testing" 10 11 "github.com/segmentio/parquet-go" 12 ) 13 14 func TestSortingWriter(t *testing.T) { 15 type Row struct { 16 Value int32 `parquet:"value"` 17 } 18 19 rows := make([]Row, 1000) 20 for i := range rows { 21 rows[i].Value = int32(i) 22 } 23 24 prng := rand.New(rand.NewSource(0)) 25 prng.Shuffle(len(rows), func(i, j int) { 26 rows[i], rows[j] = rows[j], rows[i] 27 }) 28 29 buffer := bytes.NewBuffer(nil) 30 writer := parquet.NewSortingWriter[Row](buffer, 99, 31 parquet.SortingWriterConfig( 32 parquet.SortingColumns( 33 parquet.Ascending("value"), 34 ), 35 ), 36 ) 37 38 _, err := writer.Write(rows) 39 if err != nil { 40 t.Fatal(err) 41 } 42 43 if err := writer.Close(); err != nil { 44 t.Fatal(err) 45 } 46 47 read, err := parquet.Read[Row](bytes.NewReader(buffer.Bytes()), int64(buffer.Len())) 48 if err != nil { 49 t.Fatal(err) 50 } 51 52 sort.Slice(rows, func(i, j int) bool { 53 return rows[i].Value < rows[j].Value 54 }) 55 56 assertRowsEqual(t, rows, read) 57 } 58 59 func TestSortingWriterDropDuplicatedRows(t *testing.T) { 60 type Row struct { 61 Value int32 `parquet:"value"` 62 } 63 64 rows := make([]Row, 1000) 65 for i := range rows { 66 rows[i].Value = int32(i / 2) 67 } 68 69 prng := rand.New(rand.NewSource(0)) 70 prng.Shuffle(len(rows), func(i, j int) { 71 rows[i], rows[j] = rows[j], rows[i] 72 }) 73 74 buffer := bytes.NewBuffer(nil) 75 writer := parquet.NewSortingWriter[Row](buffer, 99, 76 parquet.SortingWriterConfig( 77 parquet.SortingBuffers( 78 parquet.NewFileBufferPool("", "buffers.*"), 79 ), 80 parquet.SortingColumns( 81 parquet.Ascending("value"), 82 ), 83 parquet.DropDuplicatedRows(true), 84 ), 85 ) 86 87 _, err := writer.Write(rows) 88 if err != nil { 89 t.Fatal(err) 90 } 91 92 if err := writer.Close(); err != nil { 93 t.Fatal(err) 94 } 95 96 read, err := parquet.Read[Row](bytes.NewReader(buffer.Bytes()), int64(buffer.Len())) 97 if err != nil { 98 t.Fatal(err) 99 } 100 101 sort.Slice(rows, func(i, j int) bool { 102 return rows[i].Value < rows[j].Value 103 }) 104 105 n := len(rows) / 2 106 for i := range rows[:n] { 107 rows[i] = rows[2*i] 108 } 109 110 assertRowsEqual(t, rows[:n], read) 111 }