github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/sorting_test.go (about) 1 package parquet_test 2 3 import ( 4 "bytes" 5 "encoding/binary" 6 "math/rand" 7 "os" 8 "reflect" 9 "sort" 10 "testing" 11 "time" 12 13 "github.com/parquet-go/parquet-go" 14 ) 15 16 func TestSortingWriter(t *testing.T) { 17 type Row struct { 18 Value int32 `parquet:"value"` 19 } 20 21 rows := make([]Row, 1000) 22 for i := range rows { 23 rows[i].Value = int32(i) 24 } 25 26 prng := rand.New(rand.NewSource(0)) 27 prng.Shuffle(len(rows), func(i, j int) { 28 rows[i], rows[j] = rows[j], rows[i] 29 }) 30 31 buffer := bytes.NewBuffer(nil) 32 writer := parquet.NewSortingWriter[Row](buffer, 99, 33 parquet.SortingWriterConfig( 34 parquet.SortingColumns( 35 parquet.Ascending("value"), 36 ), 37 ), 38 ) 39 40 _, err := writer.Write(rows) 41 if err != nil { 42 t.Fatal(err) 43 } 44 45 if err := writer.Close(); err != nil { 46 t.Fatal(err) 47 } 48 49 read, err := parquet.Read[Row](bytes.NewReader(buffer.Bytes()), int64(buffer.Len())) 50 if err != nil { 51 t.Fatal(err) 52 } 53 54 sort.Slice(rows, func(i, j int) bool { 55 return rows[i].Value < rows[j].Value 56 }) 57 58 assertRowsEqual(t, rows, read) 59 } 60 61 func TestSortingWriterDropDuplicatedRows(t *testing.T) { 62 type Row struct { 63 Value int32 `parquet:"value"` 64 } 65 66 rows := make([]Row, 1000) 67 for i := range rows { 68 rows[i].Value = int32(i / 2) 69 } 70 71 prng := rand.New(rand.NewSource(0)) 72 prng.Shuffle(len(rows), func(i, j int) { 73 rows[i], rows[j] = rows[j], rows[i] 74 }) 75 76 buffer := bytes.NewBuffer(nil) 77 writer := parquet.NewSortingWriter[Row](buffer, 99, 78 parquet.SortingWriterConfig( 79 parquet.SortingBuffers( 80 parquet.NewFileBufferPool("", "buffers.*"), 81 ), 82 parquet.SortingColumns( 83 parquet.Ascending("value"), 84 ), 85 parquet.DropDuplicatedRows(true), 86 ), 87 ) 88 89 _, err := writer.Write(rows) 90 if err != nil { 91 t.Fatal(err) 92 } 93 94 if err := writer.Close(); err != nil { 95 t.Fatal(err) 96 } 97 98 read, err := parquet.Read[Row](bytes.NewReader(buffer.Bytes()), int64(buffer.Len())) 99 if err != nil { 100 t.Fatal(err) 101 } 102 103 sort.Slice(rows, func(i, j int) bool { 104 return rows[i].Value < rows[j].Value 105 }) 106 107 n := len(rows) / 2 108 for i := range rows[:n] { 109 rows[i] = rows[2*i] 110 } 111 112 assertRowsEqual(t, rows[:n], read) 113 } 114 115 func TestSortingWriterCorruptedString(t *testing.T) { 116 type Row struct { 117 Tag string `parquet:"tag"` 118 } 119 rowsWant := make([]Row, 107) // passes at 106, but fails at 107+ 120 for i := range rowsWant { 121 rowsWant[i].Tag = randString(100) 122 } 123 124 buffer := bytes.NewBuffer(nil) 125 126 writer := parquet.NewSortingWriter[Row](buffer, 2000, 127 &parquet.WriterConfig{ 128 PageBufferSize: 2560, 129 Sorting: parquet.SortingConfig{ 130 SortingColumns: []parquet.SortingColumn{ 131 parquet.Ascending("tag"), 132 }, 133 }, 134 }) 135 136 _, err := writer.Write(rowsWant) 137 if err != nil { 138 t.Fatal(err) 139 } 140 141 if err := writer.Close(); err != nil { 142 t.Fatal(err) 143 } 144 145 rowsGot, err := parquet.Read[Row](bytes.NewReader(buffer.Bytes()), int64(buffer.Len())) 146 if err != nil { 147 t.Fatal(err) 148 } 149 150 sort.Slice(rowsWant, func(i, j int) bool { 151 return rowsWant[i].Tag < rowsWant[j].Tag 152 }) 153 154 assertRowsEqualByRow(t, rowsGot, rowsWant) 155 } 156 157 func TestSortingWriterCorruptedFixedLenByteArray(t *testing.T) { 158 type Row struct { 159 ID [16]byte `parquet:"id,uuid"` 160 } 161 rowsWant := make([]Row, 700) // passes at 300, fails at 400+. 162 for i := range rowsWant { 163 rowsWant[i].ID = rand16bytes() 164 } 165 166 buffer := bytes.NewBuffer(nil) 167 168 writer := parquet.NewSortingWriter[Row](buffer, 2000, 169 &parquet.WriterConfig{ 170 PageBufferSize: 2560, 171 Sorting: parquet.SortingConfig{ 172 SortingColumns: []parquet.SortingColumn{ 173 parquet.Ascending("id"), 174 }, 175 }, 176 }) 177 178 _, err := writer.Write(rowsWant) 179 if err != nil { 180 t.Fatal(err) 181 } 182 183 if err := writer.Close(); err != nil { 184 t.Fatal(err) 185 } 186 187 rowsGot, err := parquet.Read[Row](bytes.NewReader(buffer.Bytes()), int64(buffer.Len())) 188 if err != nil { 189 t.Fatal(err) 190 } 191 192 sort.Slice(rowsWant, func(i, j int) bool { 193 return idLess(rowsWant[i].ID, rowsWant[j].ID) 194 }) 195 196 assertRowsEqualByRow(t, rowsGot, rowsWant) 197 } 198 199 const letterRunes = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" 200 201 func randString(n int) string { 202 b := make([]byte, n) 203 for i := range b { 204 b[i] = letterRunes[rand.New(rand.NewSource(time.Now().UnixNano())).Intn(len(letterRunes))] 205 } 206 return string(b) 207 } 208 209 func rand16bytes() [16]byte { 210 var b [16]byte 211 for i := range b { 212 b[i] = letterRunes[rand.Intn(len(letterRunes))] 213 } 214 return b 215 } 216 217 func idLess(ID1, ID2 [16]byte) bool { 218 k1 := binary.BigEndian.Uint64(ID1[:8]) 219 k2 := binary.BigEndian.Uint64(ID2[:8]) 220 switch { 221 case k1 < k2: 222 return true 223 case k1 > k2: 224 return false 225 } 226 k1 = binary.BigEndian.Uint64(ID1[8:]) 227 k2 = binary.BigEndian.Uint64(ID2[8:]) 228 return k1 < k2 229 } 230 231 func assertRowsEqualByRow[T any](t *testing.T, rowsGot, rowsWant []T) { 232 if len(rowsGot) != len(rowsWant) { 233 t.Errorf("want rows length %d but got rows length %d", len(rowsWant), len(rowsGot)) 234 } 235 count := 0 236 for i := range rowsGot { 237 if !reflect.DeepEqual(rowsGot[i], rowsWant[i]) { 238 t.Error("rows mismatch at index", i, ":") 239 t.Logf(" want: %#v\n", rowsWant[i]) 240 t.Logf(" got: %#v\n", rowsGot[i]) 241 242 // check if rowsGot[i] is even present in rowsWant 243 found := false 244 for j := range rowsWant { 245 if reflect.DeepEqual(rowsWant[j], rowsGot[i]) { 246 t.Log(" we found the row at index", j, "in want.") 247 found = true 248 break 249 } 250 } 251 if !found { 252 t.Log(" got row index", i, "isn't found in want rows, and is therefore corrupted data.") 253 } 254 count++ 255 } 256 } 257 if count > 0 { 258 t.Error(count, "rows mismatched out of", len(rowsWant), "total") 259 } 260 } 261 262 func TestIssue_82(t *testing.T) { 263 type Record struct { 264 A string `parquet:"a"` 265 } 266 267 fi, err := os.Open("testdata/lz4_raw_compressed_larger.parquet") 268 if err != nil { 269 t.Fatal(err) 270 } 271 defer fi.Close() 272 273 stat, err := fi.Stat() 274 if err != nil { 275 t.Fatal(err) 276 } 277 278 fl, err := parquet.OpenFile(fi, stat.Size()) 279 if err != nil { 280 t.Fatal(err) 281 } 282 groups := fl.RowGroups() 283 if expect, got := 1, len(groups); expect != got { 284 t.Fatalf("expected %d row groups got %d", expect, got) 285 } 286 287 fr := parquet.NewRowGroupReader(groups[0]) 288 289 var out bytes.Buffer 290 291 pw := parquet.NewSortingWriter[Record]( 292 &out, 293 1000, 294 parquet.SortingWriterConfig( 295 parquet.SortingColumns(parquet.Ascending("a")), 296 ), 297 ) 298 299 if _, err := parquet.CopyRows(pw, fr); err != nil { 300 t.Fatal(err) 301 } 302 303 if err := pw.Close(); err != nil { 304 t.Fatal(err) 305 } 306 rowsWant, err := parquet.Read[Record](fl, stat.Size()) 307 if err != nil { 308 t.Fatal(err) 309 } 310 rowsGot, err := parquet.Read[Record](bytes.NewReader(out.Bytes()), int64(out.Len())) 311 if err != nil { 312 t.Fatal(err) 313 } 314 sort.Slice(rowsWant, func(i, j int) bool { 315 return rowsWant[i].A < rowsWant[j].A 316 }) 317 assertRowsEqualByRow(t, rowsGot, rowsWant) 318 }