github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/sorting_test.go (about)

     1  //go:build go1.18
     2  
     3  package parquet_test
     4  
     5  import (
     6  	"bytes"
     7  	"math/rand"
     8  	"sort"
     9  	"testing"
    10  
    11  	"github.com/segmentio/parquet-go"
    12  )
    13  
    14  func TestSortingWriter(t *testing.T) {
    15  	type Row struct {
    16  		Value int32 `parquet:"value"`
    17  	}
    18  
    19  	rows := make([]Row, 1000)
    20  	for i := range rows {
    21  		rows[i].Value = int32(i)
    22  	}
    23  
    24  	prng := rand.New(rand.NewSource(0))
    25  	prng.Shuffle(len(rows), func(i, j int) {
    26  		rows[i], rows[j] = rows[j], rows[i]
    27  	})
    28  
    29  	buffer := bytes.NewBuffer(nil)
    30  	writer := parquet.NewSortingWriter[Row](buffer, 99,
    31  		parquet.SortingWriterConfig(
    32  			parquet.SortingColumns(
    33  				parquet.Ascending("value"),
    34  			),
    35  		),
    36  	)
    37  
    38  	_, err := writer.Write(rows)
    39  	if err != nil {
    40  		t.Fatal(err)
    41  	}
    42  
    43  	if err := writer.Close(); err != nil {
    44  		t.Fatal(err)
    45  	}
    46  
    47  	read, err := parquet.Read[Row](bytes.NewReader(buffer.Bytes()), int64(buffer.Len()))
    48  	if err != nil {
    49  		t.Fatal(err)
    50  	}
    51  
    52  	sort.Slice(rows, func(i, j int) bool {
    53  		return rows[i].Value < rows[j].Value
    54  	})
    55  
    56  	assertRowsEqual(t, rows, read)
    57  }
    58  
    59  func TestSortingWriterDropDuplicatedRows(t *testing.T) {
    60  	type Row struct {
    61  		Value int32 `parquet:"value"`
    62  	}
    63  
    64  	rows := make([]Row, 1000)
    65  	for i := range rows {
    66  		rows[i].Value = int32(i / 2)
    67  	}
    68  
    69  	prng := rand.New(rand.NewSource(0))
    70  	prng.Shuffle(len(rows), func(i, j int) {
    71  		rows[i], rows[j] = rows[j], rows[i]
    72  	})
    73  
    74  	buffer := bytes.NewBuffer(nil)
    75  	writer := parquet.NewSortingWriter[Row](buffer, 99,
    76  		parquet.SortingWriterConfig(
    77  			parquet.SortingBuffers(
    78  				parquet.NewFileBufferPool("", "buffers.*"),
    79  			),
    80  			parquet.SortingColumns(
    81  				parquet.Ascending("value"),
    82  			),
    83  			parquet.DropDuplicatedRows(true),
    84  		),
    85  	)
    86  
    87  	_, err := writer.Write(rows)
    88  	if err != nil {
    89  		t.Fatal(err)
    90  	}
    91  
    92  	if err := writer.Close(); err != nil {
    93  		t.Fatal(err)
    94  	}
    95  
    96  	read, err := parquet.Read[Row](bytes.NewReader(buffer.Bytes()), int64(buffer.Len()))
    97  	if err != nil {
    98  		t.Fatal(err)
    99  	}
   100  
   101  	sort.Slice(rows, func(i, j int) bool {
   102  		return rows[i].Value < rows[j].Value
   103  	})
   104  
   105  	n := len(rows) / 2
   106  	for i := range rows[:n] {
   107  		rows[i] = rows[2*i]
   108  	}
   109  
   110  	assertRowsEqual(t, rows[:n], read)
   111  }