github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/dedupe_test.go (about) 1 //go:build go1.18 2 3 package parquet_test 4 5 import ( 6 "sort" 7 "testing" 8 9 "github.com/segmentio/parquet-go" 10 ) 11 12 func TestDedupeRowReader(t *testing.T) { 13 type Row struct { 14 Value int32 `parquet:"value"` 15 } 16 17 rows := make([]Row, 1000) 18 for i := range rows { 19 rows[i].Value = int32(i / 3) 20 } 21 22 dedupeMap := make(map[Row]struct{}, len(rows)) 23 for _, row := range rows { 24 dedupeMap[row] = struct{}{} 25 } 26 27 dedupeRows := make([]Row, 0, len(dedupeMap)) 28 for row := range dedupeMap { 29 dedupeRows = append(dedupeRows, row) 30 } 31 32 sort.Slice(dedupeRows, func(i, j int) bool { 33 return dedupeRows[i].Value < dedupeRows[j].Value 34 }) 35 36 buffer1 := parquet.NewRowBuffer[Row]() 37 buffer1.Write(rows) 38 39 buffer1Rows := buffer1.Rows() 40 defer buffer1Rows.Close() 41 42 buffer2 := parquet.NewRowBuffer[Row]() 43 44 _, err := parquet.CopyRows(buffer2, 45 parquet.DedupeRowReader(buffer1Rows, 46 buffer1.Schema().Comparator(parquet.Ascending("value")), 47 ), 48 ) 49 if err != nil { 50 t.Fatal(err) 51 } 52 53 reader := parquet.NewGenericRowGroupReader[Row](buffer2) 54 defer reader.Close() 55 56 n, _ := reader.Read(rows) 57 assertRowsEqual(t, dedupeRows, rows[:n]) 58 } 59 60 func TestDedupeRowWriter(t *testing.T) { 61 type Row struct { 62 Value int32 `parquet:"value"` 63 } 64 65 rows := make([]Row, 1000) 66 for i := range rows { 67 rows[i].Value = int32(i / 3) 68 } 69 70 dedupeMap := make(map[Row]struct{}, len(rows)) 71 for _, row := range rows { 72 dedupeMap[row] = struct{}{} 73 } 74 75 dedupeRows := make([]Row, 0, len(dedupeMap)) 76 for row := range dedupeMap { 77 dedupeRows = append(dedupeRows, row) 78 } 79 80 sort.Slice(dedupeRows, func(i, j int) bool { 81 return dedupeRows[i].Value < dedupeRows[j].Value 82 }) 83 84 buffer1 := parquet.NewRowBuffer[Row]() 85 buffer1.Write(rows) 86 87 buffer1Rows := buffer1.Rows() 88 defer buffer1Rows.Close() 89 90 buffer2 := parquet.NewRowBuffer[Row]() 91 92 _, err := parquet.CopyRows( 93 parquet.DedupeRowWriter(buffer2, 94 buffer1.Schema().Comparator(parquet.Ascending("value")), 95 ), 96 buffer1Rows, 97 ) 98 if err != nil { 99 t.Fatal(err) 100 } 101 102 reader := parquet.NewGenericRowGroupReader[Row](buffer2) 103 defer reader.Close() 104 105 n, _ := reader.Read(rows) 106 assertRowsEqual(t, dedupeRows, rows[:n]) 107 }