github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/dedupe_test.go (about)

     1  //go:build go1.18
     2  
     3  package parquet_test
     4  
     5  import (
     6  	"sort"
     7  	"testing"
     8  
     9  	"github.com/segmentio/parquet-go"
    10  )
    11  
    12  func TestDedupeRowReader(t *testing.T) {
    13  	type Row struct {
    14  		Value int32 `parquet:"value"`
    15  	}
    16  
    17  	rows := make([]Row, 1000)
    18  	for i := range rows {
    19  		rows[i].Value = int32(i / 3)
    20  	}
    21  
    22  	dedupeMap := make(map[Row]struct{}, len(rows))
    23  	for _, row := range rows {
    24  		dedupeMap[row] = struct{}{}
    25  	}
    26  
    27  	dedupeRows := make([]Row, 0, len(dedupeMap))
    28  	for row := range dedupeMap {
    29  		dedupeRows = append(dedupeRows, row)
    30  	}
    31  
    32  	sort.Slice(dedupeRows, func(i, j int) bool {
    33  		return dedupeRows[i].Value < dedupeRows[j].Value
    34  	})
    35  
    36  	buffer1 := parquet.NewRowBuffer[Row]()
    37  	buffer1.Write(rows)
    38  
    39  	buffer1Rows := buffer1.Rows()
    40  	defer buffer1Rows.Close()
    41  
    42  	buffer2 := parquet.NewRowBuffer[Row]()
    43  
    44  	_, err := parquet.CopyRows(buffer2,
    45  		parquet.DedupeRowReader(buffer1Rows,
    46  			buffer1.Schema().Comparator(parquet.Ascending("value")),
    47  		),
    48  	)
    49  	if err != nil {
    50  		t.Fatal(err)
    51  	}
    52  
    53  	reader := parquet.NewGenericRowGroupReader[Row](buffer2)
    54  	defer reader.Close()
    55  
    56  	n, _ := reader.Read(rows)
    57  	assertRowsEqual(t, dedupeRows, rows[:n])
    58  }
    59  
    60  func TestDedupeRowWriter(t *testing.T) {
    61  	type Row struct {
    62  		Value int32 `parquet:"value"`
    63  	}
    64  
    65  	rows := make([]Row, 1000)
    66  	for i := range rows {
    67  		rows[i].Value = int32(i / 3)
    68  	}
    69  
    70  	dedupeMap := make(map[Row]struct{}, len(rows))
    71  	for _, row := range rows {
    72  		dedupeMap[row] = struct{}{}
    73  	}
    74  
    75  	dedupeRows := make([]Row, 0, len(dedupeMap))
    76  	for row := range dedupeMap {
    77  		dedupeRows = append(dedupeRows, row)
    78  	}
    79  
    80  	sort.Slice(dedupeRows, func(i, j int) bool {
    81  		return dedupeRows[i].Value < dedupeRows[j].Value
    82  	})
    83  
    84  	buffer1 := parquet.NewRowBuffer[Row]()
    85  	buffer1.Write(rows)
    86  
    87  	buffer1Rows := buffer1.Rows()
    88  	defer buffer1Rows.Close()
    89  
    90  	buffer2 := parquet.NewRowBuffer[Row]()
    91  
    92  	_, err := parquet.CopyRows(
    93  		parquet.DedupeRowWriter(buffer2,
    94  			buffer1.Schema().Comparator(parquet.Ascending("value")),
    95  		),
    96  		buffer1Rows,
    97  	)
    98  	if err != nil {
    99  		t.Fatal(err)
   100  	}
   101  
   102  	reader := parquet.NewGenericRowGroupReader[Row](buffer2)
   103  	defer reader.Close()
   104  
   105  	n, _ := reader.Read(rows)
   106  	assertRowsEqual(t, dedupeRows, rows[:n])
   107  }