github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/dedupe_test.go (about)

     1  package parquet_test
     2  
     3  import (
     4  	"sort"
     5  	"testing"
     6  
     7  	"github.com/parquet-go/parquet-go"
     8  )
     9  
    10  func TestDedupeRowReader(t *testing.T) {
    11  	type Row struct {
    12  		Value int32 `parquet:"value"`
    13  	}
    14  
    15  	rows := make([]Row, 1000)
    16  	for i := range rows {
    17  		rows[i].Value = int32(i / 3)
    18  	}
    19  
    20  	dedupeMap := make(map[Row]struct{}, len(rows))
    21  	for _, row := range rows {
    22  		dedupeMap[row] = struct{}{}
    23  	}
    24  
    25  	dedupeRows := make([]Row, 0, len(dedupeMap))
    26  	for row := range dedupeMap {
    27  		dedupeRows = append(dedupeRows, row)
    28  	}
    29  
    30  	sort.Slice(dedupeRows, func(i, j int) bool {
    31  		return dedupeRows[i].Value < dedupeRows[j].Value
    32  	})
    33  
    34  	buffer1 := parquet.NewRowBuffer[Row]()
    35  	buffer1.Write(rows)
    36  
    37  	buffer1Rows := buffer1.Rows()
    38  	defer buffer1Rows.Close()
    39  
    40  	buffer2 := parquet.NewRowBuffer[Row]()
    41  
    42  	_, err := parquet.CopyRows(buffer2,
    43  		parquet.DedupeRowReader(buffer1Rows,
    44  			buffer1.Schema().Comparator(parquet.Ascending("value")),
    45  		),
    46  	)
    47  	if err != nil {
    48  		t.Fatal(err)
    49  	}
    50  
    51  	reader := parquet.NewGenericRowGroupReader[Row](buffer2)
    52  	defer reader.Close()
    53  
    54  	n, _ := reader.Read(rows)
    55  	assertRowsEqual(t, dedupeRows, rows[:n])
    56  }
    57  
    58  func TestDedupeRowWriter(t *testing.T) {
    59  	type Row struct {
    60  		Value int32 `parquet:"value"`
    61  	}
    62  
    63  	rows := make([]Row, 1000)
    64  	for i := range rows {
    65  		rows[i].Value = int32(i / 3)
    66  	}
    67  
    68  	dedupeMap := make(map[Row]struct{}, len(rows))
    69  	for _, row := range rows {
    70  		dedupeMap[row] = struct{}{}
    71  	}
    72  
    73  	dedupeRows := make([]Row, 0, len(dedupeMap))
    74  	for row := range dedupeMap {
    75  		dedupeRows = append(dedupeRows, row)
    76  	}
    77  
    78  	sort.Slice(dedupeRows, func(i, j int) bool {
    79  		return dedupeRows[i].Value < dedupeRows[j].Value
    80  	})
    81  
    82  	buffer1 := parquet.NewRowBuffer[Row]()
    83  	buffer1.Write(rows)
    84  
    85  	buffer1Rows := buffer1.Rows()
    86  	defer buffer1Rows.Close()
    87  
    88  	buffer2 := parquet.NewRowBuffer[Row]()
    89  
    90  	_, err := parquet.CopyRows(
    91  		parquet.DedupeRowWriter(buffer2,
    92  			buffer1.Schema().Comparator(parquet.Ascending("value")),
    93  		),
    94  		buffer1Rows,
    95  	)
    96  	if err != nil {
    97  		t.Fatal(err)
    98  	}
    99  
   100  	reader := parquet.NewGenericRowGroupReader[Row](buffer2)
   101  	defer reader.Close()
   102  
   103  	n, _ := reader.Read(rows)
   104  	assertRowsEqual(t, dedupeRows, rows[:n])
   105  }