github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/dedupe_test.go (about) 1 package parquet_test 2 3 import ( 4 "sort" 5 "testing" 6 7 "github.com/parquet-go/parquet-go" 8 ) 9 10 func TestDedupeRowReader(t *testing.T) { 11 type Row struct { 12 Value int32 `parquet:"value"` 13 } 14 15 rows := make([]Row, 1000) 16 for i := range rows { 17 rows[i].Value = int32(i / 3) 18 } 19 20 dedupeMap := make(map[Row]struct{}, len(rows)) 21 for _, row := range rows { 22 dedupeMap[row] = struct{}{} 23 } 24 25 dedupeRows := make([]Row, 0, len(dedupeMap)) 26 for row := range dedupeMap { 27 dedupeRows = append(dedupeRows, row) 28 } 29 30 sort.Slice(dedupeRows, func(i, j int) bool { 31 return dedupeRows[i].Value < dedupeRows[j].Value 32 }) 33 34 buffer1 := parquet.NewRowBuffer[Row]() 35 buffer1.Write(rows) 36 37 buffer1Rows := buffer1.Rows() 38 defer buffer1Rows.Close() 39 40 buffer2 := parquet.NewRowBuffer[Row]() 41 42 _, err := parquet.CopyRows(buffer2, 43 parquet.DedupeRowReader(buffer1Rows, 44 buffer1.Schema().Comparator(parquet.Ascending("value")), 45 ), 46 ) 47 if err != nil { 48 t.Fatal(err) 49 } 50 51 reader := parquet.NewGenericRowGroupReader[Row](buffer2) 52 defer reader.Close() 53 54 n, _ := reader.Read(rows) 55 assertRowsEqual(t, dedupeRows, rows[:n]) 56 } 57 58 func TestDedupeRowWriter(t *testing.T) { 59 type Row struct { 60 Value int32 `parquet:"value"` 61 } 62 63 rows := make([]Row, 1000) 64 for i := range rows { 65 rows[i].Value = int32(i / 3) 66 } 67 68 dedupeMap := make(map[Row]struct{}, len(rows)) 69 for _, row := range rows { 70 dedupeMap[row] = struct{}{} 71 } 72 73 dedupeRows := make([]Row, 0, len(dedupeMap)) 74 for row := range dedupeMap { 75 dedupeRows = append(dedupeRows, row) 76 } 77 78 sort.Slice(dedupeRows, func(i, j int) bool { 79 return dedupeRows[i].Value < dedupeRows[j].Value 80 }) 81 82 buffer1 := parquet.NewRowBuffer[Row]() 83 buffer1.Write(rows) 84 85 buffer1Rows := buffer1.Rows() 86 defer buffer1Rows.Close() 87 88 buffer2 := parquet.NewRowBuffer[Row]() 89 90 _, err := parquet.CopyRows( 91 parquet.DedupeRowWriter(buffer2, 92 buffer1.Schema().Comparator(parquet.Ascending("value")), 93 ), 94 buffer1Rows, 95 ) 96 if err != nil { 97 t.Fatal(err) 98 } 99 100 reader := parquet.NewGenericRowGroupReader[Row](buffer2) 101 defer reader.Close() 102 103 n, _ := reader.Read(rows) 104 assertRowsEqual(t, dedupeRows, rows[:n]) 105 }