github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/row_group_test.go (about) 1 package parquet_test 2 3 import ( 4 "bytes" 5 "reflect" 6 "sort" 7 "testing" 8 9 "github.com/vc42/parquet-go" 10 ) 11 12 func sortedRowGroup(options []parquet.RowGroupOption, rows ...interface{}) parquet.RowGroup { 13 buf := parquet.NewBuffer(options...) 14 for _, row := range rows { 15 buf.Write(row) 16 } 17 sort.Stable(buf) 18 return buf 19 } 20 21 type Person struct { 22 FirstName utf8string 23 LastName utf8string 24 Age int 25 } 26 27 type LastNameOnly struct { 28 LastName utf8string 29 } 30 31 func newPeopleBuffer(people []Person) parquet.RowGroup { 32 buffer := parquet.NewBuffer() 33 for i := range people { 34 buffer.Write(&people[i]) 35 } 36 return buffer 37 } 38 39 func newPeopleFile(people []Person) parquet.RowGroup { 40 buffer := new(bytes.Buffer) 41 writer := parquet.NewWriter(buffer) 42 for i := range people { 43 writer.Write(&people[i]) 44 } 45 writer.Close() 46 reader := bytes.NewReader(buffer.Bytes()) 47 f, err := parquet.OpenFile(reader, reader.Size()) 48 if err != nil { 49 panic(err) 50 } 51 return f.RowGroups()[0] 52 } 53 54 func TestSeekToRow(t *testing.T) { 55 for _, config := range []struct { 56 name string 57 newRowGroup func([]Person) parquet.RowGroup 58 }{ 59 {name: "buffer", newRowGroup: newPeopleBuffer}, 60 {name: "file", newRowGroup: newPeopleFile}, 61 } { 62 t.Run(config.name, func(t *testing.T) { testSeekToRow(t, config.newRowGroup) }) 63 } 64 } 65 66 func testSeekToRow(t *testing.T, newRowGroup func([]Person) parquet.RowGroup) { 67 err := quickCheck(func(people []Person) bool { 68 if len(people) == 0 { // TODO: fix creation of empty parquet files 69 return true 70 } 71 rowGroup := newRowGroup(people) 72 rows := rowGroup.Rows() 73 rbuf := make([]parquet.Row, 1) 74 pers := Person{} 75 schema := parquet.SchemaOf(&pers) 76 defer rows.Close() 77 78 for i := range people { 79 if err := rows.SeekToRow(int64(i)); err != nil { 80 t.Errorf("seeking to row %d: %+v", i, err) 81 return false 82 } 83 if _, err := rows.ReadRows(rbuf); err != nil { 84 t.Errorf("reading row %d: %+v", i, err) 85 return false 86 } 87 if err := schema.Reconstruct(&pers, rbuf[0]); err != nil { 88 t.Errorf("deconstructing row %d: %+v", i, err) 89 return false 90 } 91 if !reflect.DeepEqual(&pers, &people[i]) { 92 t.Errorf("row %d mismatch", i) 93 return false 94 } 95 } 96 97 return true 98 }) 99 if err != nil { 100 t.Error(err) 101 } 102 } 103 104 func selfRowGroup(rowGroup parquet.RowGroup) parquet.RowGroup { 105 return rowGroup 106 } 107 108 func fileRowGroup(rowGroup parquet.RowGroup) parquet.RowGroup { 109 buffer := new(bytes.Buffer) 110 writer := parquet.NewWriter(buffer) 111 if _, err := writer.WriteRowGroup(rowGroup); err != nil { 112 panic(err) 113 } 114 if err := writer.Close(); err != nil { 115 panic(err) 116 } 117 reader := bytes.NewReader(buffer.Bytes()) 118 f, err := parquet.OpenFile(reader, reader.Size()) 119 if err != nil { 120 panic(err) 121 } 122 return f.RowGroups()[0] 123 } 124 125 func TestMergeRowGroups(t *testing.T) { 126 tests := []struct { 127 scenario string 128 options []parquet.RowGroupOption 129 input []parquet.RowGroup 130 output parquet.RowGroup 131 }{ 132 { 133 scenario: "no row groups", 134 options: []parquet.RowGroupOption{ 135 parquet.SchemaOf(Person{}), 136 }, 137 output: sortedRowGroup( 138 []parquet.RowGroupOption{ 139 parquet.SchemaOf(Person{}), 140 }, 141 ), 142 }, 143 144 { 145 scenario: "a single row group", 146 input: []parquet.RowGroup{ 147 sortedRowGroup(nil, 148 Person{FirstName: "some", LastName: "one", Age: 30}, 149 Person{FirstName: "some", LastName: "one else", Age: 31}, 150 Person{FirstName: "and", LastName: "you", Age: 32}, 151 ), 152 }, 153 output: sortedRowGroup(nil, 154 Person{FirstName: "some", LastName: "one", Age: 30}, 155 Person{FirstName: "some", LastName: "one else", Age: 31}, 156 Person{FirstName: "and", LastName: "you", Age: 32}, 157 ), 158 }, 159 160 { 161 scenario: "two row groups without ordering", 162 input: []parquet.RowGroup{ 163 sortedRowGroup(nil, Person{FirstName: "some", LastName: "one", Age: 30}), 164 sortedRowGroup(nil, Person{FirstName: "some", LastName: "one else", Age: 31}), 165 }, 166 output: sortedRowGroup(nil, 167 Person{FirstName: "some", LastName: "one", Age: 30}, 168 Person{FirstName: "some", LastName: "one else", Age: 31}, 169 ), 170 }, 171 172 { 173 scenario: "three row groups without ordering", 174 input: []parquet.RowGroup{ 175 sortedRowGroup(nil, Person{FirstName: "some", LastName: "one", Age: 30}), 176 sortedRowGroup(nil, Person{FirstName: "some", LastName: "one else", Age: 31}), 177 sortedRowGroup(nil, Person{FirstName: "question", LastName: "answer", Age: 42}), 178 }, 179 output: sortedRowGroup(nil, 180 Person{FirstName: "some", LastName: "one", Age: 30}, 181 Person{FirstName: "some", LastName: "one else", Age: 31}, 182 Person{FirstName: "question", LastName: "answer", Age: 42}, 183 ), 184 }, 185 186 { 187 scenario: "row groups sorted by ascending last name", 188 options: []parquet.RowGroupOption{ 189 parquet.SortingColumns( 190 parquet.Ascending("LastName"), 191 ), 192 }, 193 input: []parquet.RowGroup{ 194 sortedRowGroup( 195 []parquet.RowGroupOption{ 196 parquet.SortingColumns( 197 parquet.Ascending("LastName"), 198 ), 199 }, 200 Person{FirstName: "Han", LastName: "Solo"}, 201 Person{FirstName: "Luke", LastName: "Skywalker"}, 202 ), 203 sortedRowGroup( 204 []parquet.RowGroupOption{ 205 parquet.SortingColumns( 206 parquet.Ascending("LastName"), 207 ), 208 }, 209 Person{FirstName: "Obiwan", LastName: "Kenobi"}, 210 ), 211 }, 212 output: sortedRowGroup(nil, 213 Person{FirstName: "Obiwan", LastName: "Kenobi"}, 214 Person{FirstName: "Luke", LastName: "Skywalker"}, 215 Person{FirstName: "Han", LastName: "Solo"}, 216 ), 217 }, 218 219 { 220 scenario: "row groups sorted by descending last name", 221 options: []parquet.RowGroupOption{ 222 parquet.SortingColumns( 223 parquet.Descending("LastName"), 224 ), 225 }, 226 input: []parquet.RowGroup{ 227 sortedRowGroup( 228 []parquet.RowGroupOption{ 229 parquet.SortingColumns( 230 parquet.Descending("LastName"), 231 ), 232 }, 233 Person{FirstName: "Han", LastName: "Solo"}, 234 Person{FirstName: "Luke", LastName: "Skywalker"}, 235 ), 236 sortedRowGroup( 237 []parquet.RowGroupOption{ 238 parquet.SortingColumns( 239 parquet.Descending("LastName"), 240 ), 241 }, 242 Person{FirstName: "Obiwan", LastName: "Kenobi"}, 243 ), 244 }, 245 output: sortedRowGroup(nil, 246 Person{FirstName: "Han", LastName: "Solo"}, 247 Person{FirstName: "Luke", LastName: "Skywalker"}, 248 Person{FirstName: "Obiwan", LastName: "Kenobi"}, 249 ), 250 }, 251 252 { 253 scenario: "row groups sorted by ascending last and first name", 254 options: []parquet.RowGroupOption{ 255 parquet.SortingColumns( 256 parquet.Ascending("LastName"), 257 parquet.Ascending("FirstName"), 258 ), 259 }, 260 input: []parquet.RowGroup{ 261 sortedRowGroup( 262 []parquet.RowGroupOption{ 263 parquet.SortingColumns( 264 parquet.Ascending("LastName"), 265 parquet.Ascending("FirstName"), 266 ), 267 }, 268 Person{FirstName: "Luke", LastName: "Skywalker"}, 269 Person{FirstName: "Han", LastName: "Solo"}, 270 ), 271 sortedRowGroup( 272 []parquet.RowGroupOption{ 273 parquet.SortingColumns( 274 parquet.Ascending("LastName"), 275 parquet.Ascending("FirstName"), 276 ), 277 }, 278 Person{FirstName: "Obiwan", LastName: "Kenobi"}, 279 Person{FirstName: "Anakin", LastName: "Skywalker"}, 280 ), 281 }, 282 output: sortedRowGroup(nil, 283 Person{FirstName: "Obiwan", LastName: "Kenobi"}, 284 Person{FirstName: "Anakin", LastName: "Skywalker"}, 285 Person{FirstName: "Luke", LastName: "Skywalker"}, 286 Person{FirstName: "Han", LastName: "Solo"}, 287 ), 288 }, 289 290 { 291 scenario: "row groups with conversion to a different schema", 292 options: []parquet.RowGroupOption{ 293 parquet.SchemaOf(LastNameOnly{}), 294 parquet.SortingColumns( 295 parquet.Ascending("LastName"), 296 ), 297 }, 298 input: []parquet.RowGroup{ 299 sortedRowGroup( 300 []parquet.RowGroupOption{ 301 parquet.SortingColumns( 302 parquet.Ascending("LastName"), 303 ), 304 }, 305 Person{FirstName: "Han", LastName: "Solo"}, 306 Person{FirstName: "Luke", LastName: "Skywalker"}, 307 ), 308 sortedRowGroup( 309 []parquet.RowGroupOption{ 310 parquet.SortingColumns( 311 parquet.Ascending("LastName"), 312 ), 313 }, 314 Person{FirstName: "Obiwan", LastName: "Kenobi"}, 315 Person{FirstName: "Anakin", LastName: "Skywalker"}, 316 ), 317 }, 318 output: sortedRowGroup( 319 []parquet.RowGroupOption{ 320 parquet.SortingColumns( 321 parquet.Ascending("LastName"), 322 ), 323 }, 324 LastNameOnly{LastName: "Solo"}, 325 LastNameOnly{LastName: "Skywalker"}, 326 LastNameOnly{LastName: "Skywalker"}, 327 LastNameOnly{LastName: "Kenobi"}, 328 ), 329 }, 330 } 331 332 for _, adapter := range []struct { 333 scenario string 334 function func(parquet.RowGroup) parquet.RowGroup 335 }{ 336 {scenario: "buffer", function: selfRowGroup}, 337 {scenario: "file", function: fileRowGroup}, 338 } { 339 t.Run(adapter.scenario, func(t *testing.T) { 340 for _, test := range tests { 341 t.Run(test.scenario, func(t *testing.T) { 342 input := make([]parquet.RowGroup, len(test.input)) 343 for i := range test.input { 344 input[i] = adapter.function(test.input[i]) 345 } 346 347 merged, err := parquet.MergeRowGroups(test.input, test.options...) 348 if err != nil { 349 t.Fatal(err) 350 } 351 if merged.NumRows() != test.output.NumRows() { 352 t.Fatalf("the number of rows mismatch: want=%d got=%d", merged.NumRows(), test.output.NumRows()) 353 } 354 if merged.Schema() != test.output.Schema() { 355 t.Fatalf("the row group schemas mismatch:\n%v\n%v", test.output.Schema(), merged.Schema()) 356 } 357 358 options := []parquet.RowGroupOption{parquet.SchemaOf(Person{})} 359 options = append(options, test.options...) 360 // We test two views of the resulting row group: the one originally 361 // returned by MergeRowGroups, and one where the merged row group 362 // has been copied into a new buffer. The intent is to exercise both 363 // the row-by-row read as well as optimized code paths when CopyRows 364 // bypasses the ReadRow/WriteRow calls and the row group is written 365 // directly to the buffer by calling WriteRowsTo/WriteRowGroup. 366 mergedCopy := parquet.NewBuffer(options...) 367 368 totalRows := test.output.NumRows() 369 numRows, err := copyRowsAndClose(mergedCopy, merged.Rows()) 370 if err != nil { 371 t.Fatal(err) 372 } 373 if numRows != totalRows { 374 t.Fatalf("wrong number of rows copied: want=%d got=%d", totalRows, numRows) 375 } 376 377 for _, merge := range []struct { 378 scenario string 379 rowGroup parquet.RowGroup 380 }{ 381 {scenario: "self", rowGroup: merged}, 382 {scenario: "copy", rowGroup: mergedCopy}, 383 } { 384 t.Run(merge.scenario, func(t *testing.T) { 385 var expectedRows = test.output.Rows() 386 var mergedRows = merge.rowGroup.Rows() 387 var row1 = make([]parquet.Row, 1) 388 var row2 = make([]parquet.Row, 1) 389 var numRows int64 390 391 defer expectedRows.Close() 392 defer mergedRows.Close() 393 394 for { 395 _, err1 := expectedRows.ReadRows(row1) 396 _, err2 := mergedRows.ReadRows(row2) 397 398 if err1 != err2 { 399 t.Fatalf("errors mismatched while comparing row %d/%d: want=%v got=%v", numRows, totalRows, err1, err2) 400 } 401 402 if err1 != nil { 403 break 404 } 405 406 if !row1[0].Equal(row2[0]) { 407 t.Errorf("row at index %d/%d mismatch: want=%+v got=%+v", numRows, totalRows, row1[0], row2[0]) 408 } 409 410 numRows++ 411 } 412 413 if numRows != totalRows { 414 t.Errorf("expected to read %d rows but %d were found", totalRows, numRows) 415 } 416 }) 417 } 418 419 }) 420 } 421 }) 422 } 423 }