github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/sorting_test.go (about)

     1  package parquet_test
     2  
     3  import (
     4  	"bytes"
     5  	"encoding/binary"
     6  	"math/rand"
     7  	"os"
     8  	"reflect"
     9  	"sort"
    10  	"testing"
    11  	"time"
    12  
    13  	"github.com/parquet-go/parquet-go"
    14  )
    15  
    16  func TestSortingWriter(t *testing.T) {
    17  	type Row struct {
    18  		Value int32 `parquet:"value"`
    19  	}
    20  
    21  	rows := make([]Row, 1000)
    22  	for i := range rows {
    23  		rows[i].Value = int32(i)
    24  	}
    25  
    26  	prng := rand.New(rand.NewSource(0))
    27  	prng.Shuffle(len(rows), func(i, j int) {
    28  		rows[i], rows[j] = rows[j], rows[i]
    29  	})
    30  
    31  	buffer := bytes.NewBuffer(nil)
    32  	writer := parquet.NewSortingWriter[Row](buffer, 99,
    33  		parquet.SortingWriterConfig(
    34  			parquet.SortingColumns(
    35  				parquet.Ascending("value"),
    36  			),
    37  		),
    38  	)
    39  
    40  	_, err := writer.Write(rows)
    41  	if err != nil {
    42  		t.Fatal(err)
    43  	}
    44  
    45  	if err := writer.Close(); err != nil {
    46  		t.Fatal(err)
    47  	}
    48  
    49  	read, err := parquet.Read[Row](bytes.NewReader(buffer.Bytes()), int64(buffer.Len()))
    50  	if err != nil {
    51  		t.Fatal(err)
    52  	}
    53  
    54  	sort.Slice(rows, func(i, j int) bool {
    55  		return rows[i].Value < rows[j].Value
    56  	})
    57  
    58  	assertRowsEqual(t, rows, read)
    59  }
    60  
    61  func TestSortingWriterDropDuplicatedRows(t *testing.T) {
    62  	type Row struct {
    63  		Value int32 `parquet:"value"`
    64  	}
    65  
    66  	rows := make([]Row, 1000)
    67  	for i := range rows {
    68  		rows[i].Value = int32(i / 2)
    69  	}
    70  
    71  	prng := rand.New(rand.NewSource(0))
    72  	prng.Shuffle(len(rows), func(i, j int) {
    73  		rows[i], rows[j] = rows[j], rows[i]
    74  	})
    75  
    76  	buffer := bytes.NewBuffer(nil)
    77  	writer := parquet.NewSortingWriter[Row](buffer, 99,
    78  		parquet.SortingWriterConfig(
    79  			parquet.SortingBuffers(
    80  				parquet.NewFileBufferPool("", "buffers.*"),
    81  			),
    82  			parquet.SortingColumns(
    83  				parquet.Ascending("value"),
    84  			),
    85  			parquet.DropDuplicatedRows(true),
    86  		),
    87  	)
    88  
    89  	_, err := writer.Write(rows)
    90  	if err != nil {
    91  		t.Fatal(err)
    92  	}
    93  
    94  	if err := writer.Close(); err != nil {
    95  		t.Fatal(err)
    96  	}
    97  
    98  	read, err := parquet.Read[Row](bytes.NewReader(buffer.Bytes()), int64(buffer.Len()))
    99  	if err != nil {
   100  		t.Fatal(err)
   101  	}
   102  
   103  	sort.Slice(rows, func(i, j int) bool {
   104  		return rows[i].Value < rows[j].Value
   105  	})
   106  
   107  	n := len(rows) / 2
   108  	for i := range rows[:n] {
   109  		rows[i] = rows[2*i]
   110  	}
   111  
   112  	assertRowsEqual(t, rows[:n], read)
   113  }
   114  
   115  func TestSortingWriterCorruptedString(t *testing.T) {
   116  	type Row struct {
   117  		Tag string `parquet:"tag"`
   118  	}
   119  	rowsWant := make([]Row, 107) // passes at 106, but fails at 107+
   120  	for i := range rowsWant {
   121  		rowsWant[i].Tag = randString(100)
   122  	}
   123  
   124  	buffer := bytes.NewBuffer(nil)
   125  
   126  	writer := parquet.NewSortingWriter[Row](buffer, 2000,
   127  		&parquet.WriterConfig{
   128  			PageBufferSize: 2560,
   129  			Sorting: parquet.SortingConfig{
   130  				SortingColumns: []parquet.SortingColumn{
   131  					parquet.Ascending("tag"),
   132  				},
   133  			},
   134  		})
   135  
   136  	_, err := writer.Write(rowsWant)
   137  	if err != nil {
   138  		t.Fatal(err)
   139  	}
   140  
   141  	if err := writer.Close(); err != nil {
   142  		t.Fatal(err)
   143  	}
   144  
   145  	rowsGot, err := parquet.Read[Row](bytes.NewReader(buffer.Bytes()), int64(buffer.Len()))
   146  	if err != nil {
   147  		t.Fatal(err)
   148  	}
   149  
   150  	sort.Slice(rowsWant, func(i, j int) bool {
   151  		return rowsWant[i].Tag < rowsWant[j].Tag
   152  	})
   153  
   154  	assertRowsEqualByRow(t, rowsGot, rowsWant)
   155  }
   156  
   157  func TestSortingWriterCorruptedFixedLenByteArray(t *testing.T) {
   158  	type Row struct {
   159  		ID [16]byte `parquet:"id,uuid"`
   160  	}
   161  	rowsWant := make([]Row, 700) // passes at 300, fails at 400+.
   162  	for i := range rowsWant {
   163  		rowsWant[i].ID = rand16bytes()
   164  	}
   165  
   166  	buffer := bytes.NewBuffer(nil)
   167  
   168  	writer := parquet.NewSortingWriter[Row](buffer, 2000,
   169  		&parquet.WriterConfig{
   170  			PageBufferSize: 2560,
   171  			Sorting: parquet.SortingConfig{
   172  				SortingColumns: []parquet.SortingColumn{
   173  					parquet.Ascending("id"),
   174  				},
   175  			},
   176  		})
   177  
   178  	_, err := writer.Write(rowsWant)
   179  	if err != nil {
   180  		t.Fatal(err)
   181  	}
   182  
   183  	if err := writer.Close(); err != nil {
   184  		t.Fatal(err)
   185  	}
   186  
   187  	rowsGot, err := parquet.Read[Row](bytes.NewReader(buffer.Bytes()), int64(buffer.Len()))
   188  	if err != nil {
   189  		t.Fatal(err)
   190  	}
   191  
   192  	sort.Slice(rowsWant, func(i, j int) bool {
   193  		return idLess(rowsWant[i].ID, rowsWant[j].ID)
   194  	})
   195  
   196  	assertRowsEqualByRow(t, rowsGot, rowsWant)
   197  }
   198  
   199  const letterRunes = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
   200  
   201  func randString(n int) string {
   202  	b := make([]byte, n)
   203  	for i := range b {
   204  		b[i] = letterRunes[rand.New(rand.NewSource(time.Now().UnixNano())).Intn(len(letterRunes))]
   205  	}
   206  	return string(b)
   207  }
   208  
   209  func rand16bytes() [16]byte {
   210  	var b [16]byte
   211  	for i := range b {
   212  		b[i] = letterRunes[rand.Intn(len(letterRunes))]
   213  	}
   214  	return b
   215  }
   216  
   217  func idLess(ID1, ID2 [16]byte) bool {
   218  	k1 := binary.BigEndian.Uint64(ID1[:8])
   219  	k2 := binary.BigEndian.Uint64(ID2[:8])
   220  	switch {
   221  	case k1 < k2:
   222  		return true
   223  	case k1 > k2:
   224  		return false
   225  	}
   226  	k1 = binary.BigEndian.Uint64(ID1[8:])
   227  	k2 = binary.BigEndian.Uint64(ID2[8:])
   228  	return k1 < k2
   229  }
   230  
   231  func assertRowsEqualByRow[T any](t *testing.T, rowsGot, rowsWant []T) {
   232  	if len(rowsGot) != len(rowsWant) {
   233  		t.Errorf("want rows length %d but got rows length %d", len(rowsWant), len(rowsGot))
   234  	}
   235  	count := 0
   236  	for i := range rowsGot {
   237  		if !reflect.DeepEqual(rowsGot[i], rowsWant[i]) {
   238  			t.Error("rows mismatch at index", i, ":")
   239  			t.Logf(" want: %#v\n", rowsWant[i])
   240  			t.Logf("  got: %#v\n", rowsGot[i])
   241  
   242  			// check if rowsGot[i] is even present in rowsWant
   243  			found := false
   244  			for j := range rowsWant {
   245  				if reflect.DeepEqual(rowsWant[j], rowsGot[i]) {
   246  					t.Log("  we found the row at index", j, "in want.")
   247  					found = true
   248  					break
   249  				}
   250  			}
   251  			if !found {
   252  				t.Log("  got row index", i, "isn't found in want rows, and is therefore corrupted data.")
   253  			}
   254  			count++
   255  		}
   256  	}
   257  	if count > 0 {
   258  		t.Error(count, "rows mismatched out of", len(rowsWant), "total")
   259  	}
   260  }
   261  
   262  func TestIssue_82(t *testing.T) {
   263  	type Record struct {
   264  		A string `parquet:"a"`
   265  	}
   266  
   267  	fi, err := os.Open("testdata/lz4_raw_compressed_larger.parquet")
   268  	if err != nil {
   269  		t.Fatal(err)
   270  	}
   271  	defer fi.Close()
   272  
   273  	stat, err := fi.Stat()
   274  	if err != nil {
   275  		t.Fatal(err)
   276  	}
   277  
   278  	fl, err := parquet.OpenFile(fi, stat.Size())
   279  	if err != nil {
   280  		t.Fatal(err)
   281  	}
   282  	groups := fl.RowGroups()
   283  	if expect, got := 1, len(groups); expect != got {
   284  		t.Fatalf("expected %d row groups got %d", expect, got)
   285  	}
   286  
   287  	fr := parquet.NewRowGroupReader(groups[0])
   288  
   289  	var out bytes.Buffer
   290  
   291  	pw := parquet.NewSortingWriter[Record](
   292  		&out,
   293  		1000,
   294  		parquet.SortingWriterConfig(
   295  			parquet.SortingColumns(parquet.Ascending("a")),
   296  		),
   297  	)
   298  
   299  	if _, err := parquet.CopyRows(pw, fr); err != nil {
   300  		t.Fatal(err)
   301  	}
   302  
   303  	if err := pw.Close(); err != nil {
   304  		t.Fatal(err)
   305  	}
   306  	rowsWant, err := parquet.Read[Record](fl, stat.Size())
   307  	if err != nil {
   308  		t.Fatal(err)
   309  	}
   310  	rowsGot, err := parquet.Read[Record](bytes.NewReader(out.Bytes()), int64(out.Len()))
   311  	if err != nil {
   312  		t.Fatal(err)
   313  	}
   314  	sort.Slice(rowsWant, func(i, j int) bool {
   315  		return rowsWant[i].A < rowsWant[j].A
   316  	})
   317  	assertRowsEqualByRow(t, rowsGot, rowsWant)
   318  }