github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/page_test.go (about) 1 package parquet_test 2 3 import ( 4 "bytes" 5 "io" 6 "reflect" 7 "testing" 8 9 "github.com/vc42/parquet-go" 10 "github.com/vc42/parquet-go/deprecated" 11 "github.com/vc42/parquet-go/encoding/plain" 12 "github.com/vc42/parquet-go/internal/unsafecast" 13 ) 14 15 func TestPage(t *testing.T) { 16 t.Run("BOOLEAN", testPageBoolean) 17 t.Run("INT32", testPageInt32) 18 t.Run("INT64", testPageInt64) 19 t.Run("INT96", testPageInt96) 20 t.Run("FLOAT", testPageFloat) 21 t.Run("DOUBLE", testPageDouble) 22 t.Run("BYTE_ARRAY", testPageByteArray) 23 t.Run("FIXED_LEN_BYTE_ARRAY", testPageFixedLenByteArray) 24 } 25 26 func testPageBoolean(t *testing.T) { 27 schema := parquet.SchemaOf(struct{ Value bool }{}) 28 29 t.Run("parquet", func(t *testing.T) { 30 testPage(t, schema, pageTest{ 31 write: func(w parquet.ValueWriter) (interface{}, error) { 32 values := []bool{false, true} 33 n, err := w.(parquet.BooleanWriter).WriteBooleans(values) 34 return values[:n], err 35 }, 36 37 read: func(r parquet.ValueReader) (interface{}, error) { 38 values := make([]bool, 2) 39 n, err := r.(parquet.BooleanReader).ReadBooleans(values) 40 return values[:n], err 41 }, 42 }) 43 }) 44 } 45 46 func testPageInt32(t *testing.T) { 47 schema := parquet.SchemaOf(struct{ Value int32 }{}) 48 49 t.Run("io", func(t *testing.T) { 50 testBufferPage(t, schema, pageTest{ 51 write: func(w parquet.ValueWriter) (interface{}, error) { 52 values := []int32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9} 53 n, err := w.(io.Writer).Write(unsafecast.Int32ToBytes(values)) 54 return values[:n/4], err 55 }, 56 57 read: func(r parquet.ValueReader) (interface{}, error) { 58 values := make([]int32, 10) 59 n, err := r.(io.Reader).Read(unsafecast.Int32ToBytes(values)) 60 return values[:n/4], err 61 }, 62 }) 63 }) 64 65 t.Run("parquet", func(t *testing.T) { 66 testPage(t, schema, pageTest{ 67 write: func(w parquet.ValueWriter) (interface{}, error) { 68 values := []int32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9} 69 n, err := w.(parquet.Int32Writer).WriteInt32s(values) 70 return values[:n], err 71 }, 72 73 read: func(r parquet.ValueReader) (interface{}, error) { 74 values := make([]int32, 10) 75 n, err := r.(parquet.Int32Reader).ReadInt32s(values) 76 return values[:n], err 77 }, 78 }) 79 }) 80 } 81 82 func testPageInt64(t *testing.T) { 83 schema := parquet.SchemaOf(struct{ Value int64 }{}) 84 85 t.Run("io", func(t *testing.T) { 86 testBufferPage(t, schema, pageTest{ 87 write: func(w parquet.ValueWriter) (interface{}, error) { 88 values := []int64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9} 89 n, err := w.(io.Writer).Write(unsafecast.Int64ToBytes(values)) 90 return values[:n/8], err 91 }, 92 93 read: func(r parquet.ValueReader) (interface{}, error) { 94 values := make([]int64, 10) 95 n, err := r.(io.Reader).Read(unsafecast.Int64ToBytes(values)) 96 return values[:n/8], err 97 }, 98 }) 99 }) 100 101 t.Run("parquet", func(t *testing.T) { 102 testPage(t, schema, pageTest{ 103 write: func(w parquet.ValueWriter) (interface{}, error) { 104 values := []int64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9} 105 n, err := w.(parquet.Int64Writer).WriteInt64s(values) 106 return values[:n], err 107 }, 108 109 read: func(r parquet.ValueReader) (interface{}, error) { 110 values := make([]int64, 10) 111 n, err := r.(parquet.Int64Reader).ReadInt64s(values) 112 return values[:n], err 113 }, 114 }) 115 }) 116 } 117 118 func testPageInt96(t *testing.T) { 119 schema := parquet.SchemaOf(struct{ Value deprecated.Int96 }{}) 120 121 t.Run("io", func(t *testing.T) { 122 testBufferPage(t, schema, pageTest{ 123 write: func(w parquet.ValueWriter) (interface{}, error) { 124 values := []deprecated.Int96{{0: 0}, {0: 1}, {0: 2}} 125 n, err := w.(io.Writer).Write(deprecated.Int96ToBytes(values)) 126 return values[:n/12], err 127 }, 128 129 read: func(r parquet.ValueReader) (interface{}, error) { 130 values := make([]deprecated.Int96, 3) 131 n, err := r.(io.Reader).Read(deprecated.Int96ToBytes(values)) 132 return values[:n/12], err 133 }, 134 }) 135 }) 136 137 t.Run("parquet", func(t *testing.T) { 138 testPage(t, schema, pageTest{ 139 write: func(w parquet.ValueWriter) (interface{}, error) { 140 values := []deprecated.Int96{{0: 0}, {0: 1}, {0: 2}} 141 n, err := w.(parquet.Int96Writer).WriteInt96s(values) 142 return values[:n], err 143 }, 144 145 read: func(r parquet.ValueReader) (interface{}, error) { 146 values := make([]deprecated.Int96, 3) 147 n, err := r.(parquet.Int96Reader).ReadInt96s(values) 148 return values[:n], err 149 }, 150 }) 151 }) 152 } 153 154 func testPageFloat(t *testing.T) { 155 schema := parquet.SchemaOf(struct{ Value float32 }{}) 156 157 t.Run("io", func(t *testing.T) { 158 testBufferPage(t, schema, pageTest{ 159 write: func(w parquet.ValueWriter) (interface{}, error) { 160 values := []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9} 161 n, err := w.(io.Writer).Write(unsafecast.Float32ToBytes(values)) 162 return values[:n/4], err 163 }, 164 165 read: func(r parquet.ValueReader) (interface{}, error) { 166 values := make([]float32, 10) 167 n, err := r.(io.Reader).Read(unsafecast.Float32ToBytes(values)) 168 return values[:n/4], err 169 }, 170 }) 171 }) 172 173 t.Run("parquet", func(t *testing.T) { 174 testPage(t, schema, pageTest{ 175 write: func(w parquet.ValueWriter) (interface{}, error) { 176 values := []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9} 177 n, err := w.(parquet.FloatWriter).WriteFloats(values) 178 return values[:n], err 179 }, 180 181 read: func(r parquet.ValueReader) (interface{}, error) { 182 values := make([]float32, 10) 183 n, err := r.(parquet.FloatReader).ReadFloats(values) 184 return values[:n], err 185 }, 186 }) 187 }) 188 } 189 190 func testPageDouble(t *testing.T) { 191 schema := parquet.SchemaOf(struct{ Value float64 }{}) 192 193 t.Run("io", func(t *testing.T) { 194 testBufferPage(t, schema, pageTest{ 195 write: func(w parquet.ValueWriter) (interface{}, error) { 196 values := []float64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9} 197 n, err := w.(io.Writer).Write(unsafecast.Float64ToBytes(values)) 198 return values[:n/8], err 199 }, 200 201 read: func(r parquet.ValueReader) (interface{}, error) { 202 values := make([]float64, 10) 203 n, err := r.(io.Reader).Read(unsafecast.Float64ToBytes(values)) 204 return values[:n/8], err 205 }, 206 }) 207 }) 208 209 t.Run("parquet", func(t *testing.T) { 210 testPage(t, schema, pageTest{ 211 write: func(w parquet.ValueWriter) (interface{}, error) { 212 values := []float64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9} 213 n, err := w.(parquet.DoubleWriter).WriteDoubles(values) 214 return values[:n], err 215 }, 216 217 read: func(r parquet.ValueReader) (interface{}, error) { 218 values := make([]float64, 10) 219 n, err := r.(parquet.DoubleReader).ReadDoubles(values) 220 return values[:n], err 221 }, 222 }) 223 }) 224 } 225 226 func testPageByteArray(t *testing.T) { 227 schema := parquet.SchemaOf(struct{ Value []byte }{}) 228 229 t.Run("io", func(t *testing.T) { 230 testBufferPage(t, schema, pageTest{ 231 write: func(w parquet.ValueWriter) (interface{}, error) { 232 values := []byte{} 233 values = plain.AppendByteArray(values, []byte("A")) 234 values = plain.AppendByteArray(values, []byte("B")) 235 values = plain.AppendByteArray(values, []byte("C")) 236 n, err := w.(io.Writer).Write(values) 237 return values[:n], err 238 }, 239 240 read: func(r parquet.ValueReader) (interface{}, error) { 241 values := make([]byte, 3+3*plain.ByteArrayLengthSize) 242 n, err := r.(io.Reader).Read(values) 243 return values[:n], err 244 }, 245 }) 246 }) 247 248 t.Run("parquet", func(t *testing.T) { 249 testPage(t, schema, pageTest{ 250 write: func(w parquet.ValueWriter) (interface{}, error) { 251 values := []byte{} 252 values = plain.AppendByteArray(values, []byte("A")) 253 values = plain.AppendByteArray(values, []byte("B")) 254 values = plain.AppendByteArray(values, []byte("C")) 255 _, err := w.(parquet.ByteArrayWriter).WriteByteArrays(values) 256 return values, err 257 }, 258 259 read: func(r parquet.ValueReader) (interface{}, error) { 260 values := make([]byte, 3+3*plain.ByteArrayLengthSize) 261 n, err := r.(parquet.ByteArrayReader).ReadByteArrays(values) 262 return values[:n+n*plain.ByteArrayLengthSize], err 263 }, 264 }) 265 }) 266 } 267 268 func testPageFixedLenByteArray(t *testing.T) { 269 schema := parquet.SchemaOf(struct{ Value [3]byte }{}) 270 271 t.Run("io", func(t *testing.T) { 272 testBufferPage(t, schema, pageTest{ 273 write: func(w parquet.ValueWriter) (interface{}, error) { 274 values := []byte("123456789") 275 n, err := w.(io.Writer).Write(values) 276 return values[:n], err 277 }, 278 279 read: func(r parquet.ValueReader) (interface{}, error) { 280 values := make([]byte, 3*3) 281 n, err := r.(io.Reader).Read(values) 282 return values[:n], err 283 }, 284 }) 285 }) 286 287 t.Run("parquet", func(t *testing.T) { 288 testPage(t, schema, pageTest{ 289 write: func(w parquet.ValueWriter) (interface{}, error) { 290 values := []byte("123456789") 291 n, err := w.(parquet.FixedLenByteArrayWriter).WriteFixedLenByteArrays(values) 292 return values[:3*n], err 293 }, 294 295 read: func(r parquet.ValueReader) (interface{}, error) { 296 values := make([]byte, 3*3) 297 n, err := r.(parquet.FixedLenByteArrayReader).ReadFixedLenByteArrays(values) 298 return values[:3*n], err 299 }, 300 }) 301 }) 302 } 303 304 type pageTest struct { 305 write func(parquet.ValueWriter) (interface{}, error) 306 read func(parquet.ValueReader) (interface{}, error) 307 } 308 309 func testPage(t *testing.T, schema *parquet.Schema, test pageTest) { 310 t.Run("buffer", func(t *testing.T) { testBufferPage(t, schema, test) }) 311 t.Run("file", func(t *testing.T) { testFilePage(t, schema, test) }) 312 } 313 314 func testBufferPage(t *testing.T, schema *parquet.Schema, test pageTest) { 315 buffer := parquet.NewBuffer(schema) 316 column := buffer.ColumnBuffers()[0] 317 318 w, err := test.write(column) 319 if err != nil { 320 t.Fatal("writing page values:", err) 321 } 322 323 r, err := test.read(column.Page().Values()) 324 if err != io.EOF { 325 t.Errorf("expected io.EOF after reading all values but got %v", err) 326 } 327 if !reflect.DeepEqual(w, r) { 328 t.Errorf("wrong values read from the page: got=%+v want=%+v", r, w) 329 } 330 } 331 332 func testFilePage(t *testing.T, schema *parquet.Schema, test pageTest) { 333 buffer := parquet.NewBuffer(schema) 334 column := buffer.ColumnBuffers()[0] 335 336 w, err := test.write(column) 337 if err != nil { 338 t.Fatal("writing page values:", err) 339 } 340 341 output := new(bytes.Buffer) 342 writer := parquet.NewWriter(output) 343 n, err := writer.WriteRowGroup(buffer) 344 if err != nil { 345 t.Fatal("writing parquet file:", err) 346 } 347 if err := writer.Close(); err != nil { 348 t.Fatal("writing parquet file:", err) 349 } 350 if n != buffer.NumRows() { 351 t.Fatalf("number of rows written mismatch: got=%d want=%d", n, buffer.NumRows()) 352 } 353 354 reader := bytes.NewReader(output.Bytes()) 355 f, err := parquet.OpenFile(reader, reader.Size()) 356 if err != nil { 357 t.Fatal("opening parquet file:", err) 358 } 359 360 pages := f.RowGroups()[0].ColumnChunks()[0].Pages() 361 defer pages.Close() 362 363 p, err := pages.ReadPage() 364 if err != nil { 365 t.Fatal("reading parquet page:", err) 366 } 367 368 values := p.Values() 369 r, err := test.read(values) 370 if err != io.EOF && err != nil { 371 t.Errorf("expected io.EOF after reading all values but got %v", err) 372 } 373 if !reflect.DeepEqual(w, r) { 374 t.Errorf("wrong values read from the page: got=%+v want=%+v", r, w) 375 } 376 if r, err := test.read(values); reflect.ValueOf(r).Len() != 0 || err != io.EOF { 377 t.Errorf("expected no data and io.EOF after reading all values but got %d and %v", r, err) 378 } 379 } 380 381 type testStruct struct { 382 Value *string 383 } 384 385 func TestOptionalPageTrailingNulls(t *testing.T) { 386 schema := parquet.SchemaOf(&testStruct{}) 387 buffer := parquet.NewBuffer(schema) 388 389 str := "test" 390 rows := []testStruct{{ 391 Value: nil, 392 }, { 393 Value: &str, 394 }, { 395 Value: nil, 396 }} 397 398 for _, row := range rows { 399 _, err := buffer.WriteRows([]parquet.Row{schema.Deconstruct(nil, row)}) 400 if err != nil { 401 t.Fatal("writing row:", err) 402 } 403 } 404 405 resultRows := make([]parquet.Row, 0, len(rows)) 406 bufferRows := make([]parquet.Row, 10) 407 reader := buffer.Rows() 408 defer reader.Close() 409 for { 410 n, err := reader.ReadRows(bufferRows) 411 resultRows = append(resultRows, bufferRows[:n]...) 412 if err != nil { 413 if err == io.EOF { 414 break 415 } 416 t.Fatal("reading rows:", err) 417 } 418 } 419 420 if len(resultRows) != len(rows) { 421 t.Errorf("wrong number of rows read: got=%d want=%d", len(resultRows), len(rows)) 422 } 423 } 424 425 func TestOptionalPagePreserveIndex(t *testing.T) { 426 schema := parquet.SchemaOf(&testStruct{}) 427 buffer := parquet.NewBuffer(schema) 428 429 _, err := buffer.WriteRows([]parquet.Row{ 430 schema.Deconstruct(nil, &testStruct{Value: nil}), 431 }) 432 if err != nil { 433 t.Fatal("writing row:", err) 434 } 435 436 rows := buffer.Rows() 437 defer rows.Close() 438 439 rowbuf := make([]parquet.Row, 2) 440 n, err := rows.ReadRows(rowbuf) 441 if err != io.EOF { 442 t.Fatal("reading rows:", err) 443 } 444 if n != 1 { 445 t.Fatal("wrong number of rows returned:", n) 446 } 447 if rowbuf[0][0].Column() != 0 { 448 t.Errorf("wrong index: got=%d want=%d", rowbuf[0][0].Column(), 0) 449 } 450 } 451 452 func TestRepeatedPageTrailingNulls(t *testing.T) { 453 type testStruct struct { 454 A []string `parquet:"a"` 455 } 456 457 s := parquet.SchemaOf(&testStruct{}) 458 459 records := []*testStruct{ 460 {A: nil}, 461 {A: []string{"test"}}, 462 {A: nil}, 463 } 464 465 buf := parquet.NewBuffer(s) 466 for _, rec := range records { 467 row := s.Deconstruct(nil, rec) 468 _, err := buf.WriteRows([]parquet.Row{row}) 469 if err != nil { 470 t.Fatal(err) 471 } 472 } 473 474 rows := make([]parquet.Row, len(records)+1) 475 reader := buf.Rows() 476 defer reader.Close() 477 478 n, err := reader.ReadRows(rows) 479 if err != io.EOF { 480 t.Fatal("reading rows:", err) 481 } 482 483 if n != len(records) { 484 t.Errorf("wrong number of rows read: got=%d want=%d", n, len(records)) 485 } 486 }