github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/parquet_test.go (about) 1 package parquet_test 2 3 import ( 4 "bytes" 5 "fmt" 6 "io" 7 "math/rand" 8 "reflect" 9 "strings" 10 "testing" 11 "time" 12 13 "github.com/google/uuid" 14 15 "github.com/segmentio/parquet-go" 16 "github.com/segmentio/parquet-go/deprecated" 17 "github.com/segmentio/parquet-go/internal/quick" 18 ) 19 20 const ( 21 benchmarkNumRows = 10_000 22 benchmarkRowsPerStep = 1000 23 ) 24 25 type benchmarkRowType struct { 26 ID [16]byte `parquet:"id,uuid"` 27 Value float64 `parquet:"value"` 28 } 29 30 func (row benchmarkRowType) generate(prng *rand.Rand) benchmarkRowType { 31 prng.Read(row.ID[:]) 32 row.Value = prng.Float64() 33 return row 34 } 35 36 type paddedBooleanColumn struct { 37 Value bool 38 _ [3]byte 39 } 40 41 func (row paddedBooleanColumn) generate(prng *rand.Rand) paddedBooleanColumn { 42 return paddedBooleanColumn{Value: prng.Int()%2 == 0} 43 } 44 45 type booleanColumn struct { 46 Value bool 47 } 48 49 func (row booleanColumn) generate(prng *rand.Rand) booleanColumn { 50 return booleanColumn{Value: prng.Int()%2 == 0} 51 } 52 53 type int32Column struct { 54 Value int32 `parquet:",delta"` 55 } 56 57 func (row int32Column) generate(prng *rand.Rand) int32Column { 58 return int32Column{Value: prng.Int31n(100)} 59 } 60 61 type int64Column struct { 62 Value int64 `parquet:",delta"` 63 } 64 65 func (row int64Column) generate(prng *rand.Rand) int64Column { 66 return int64Column{Value: prng.Int63n(100)} 67 } 68 69 type int96Column struct { 70 Value deprecated.Int96 71 } 72 73 func (row int96Column) generate(prng *rand.Rand) int96Column { 74 row.Value[0] = prng.Uint32() 75 row.Value[1] = prng.Uint32() 76 row.Value[2] = prng.Uint32() 77 return row 78 } 79 80 type floatColumn struct { 81 Value float32 82 } 83 84 func (row floatColumn) generate(prng *rand.Rand) floatColumn { 85 return floatColumn{Value: prng.Float32()} 86 } 87 88 type doubleColumn struct { 89 Value float64 90 } 91 92 func (row doubleColumn) generate(prng *rand.Rand) doubleColumn { 93 return doubleColumn{Value: prng.Float64()} 94 } 95 96 type byteArrayColumn struct { 97 Value []byte 98 } 99 100 func (row byteArrayColumn) generate(prng *rand.Rand) byteArrayColumn { 101 row.Value = make([]byte, prng.Intn(10)) 102 prng.Read(row.Value) 103 return row 104 } 105 106 type fixedLenByteArrayColumn struct { 107 Value [10]byte 108 } 109 110 func (row fixedLenByteArrayColumn) generate(prng *rand.Rand) fixedLenByteArrayColumn { 111 prng.Read(row.Value[:]) 112 return row 113 } 114 115 type stringColumn struct { 116 Value string 117 } 118 119 func (row stringColumn) generate(prng *rand.Rand) stringColumn { 120 return stringColumn{Value: generateString(prng, 10)} 121 } 122 123 type indexedStringColumn struct { 124 Value string `parquet:",dict"` 125 } 126 127 func (row indexedStringColumn) generate(prng *rand.Rand) indexedStringColumn { 128 return indexedStringColumn{Value: generateString(prng, 10)} 129 } 130 131 type uuidColumn struct { 132 Value uuid.UUID `parquet:",delta"` 133 } 134 135 func (row uuidColumn) generate(prng *rand.Rand) uuidColumn { 136 prng.Read(row.Value[:]) 137 return row 138 } 139 140 type timeColumn struct { 141 Value time.Time 142 } 143 144 func (row timeColumn) generate(prng *rand.Rand) timeColumn { 145 t := time.Unix(0, prng.Int63()).UTC() 146 return timeColumn{Value: t} 147 } 148 149 type timeInMillisColumn struct { 150 Value time.Time `parquet:",timestamp(millisecond)"` 151 } 152 153 func (row timeInMillisColumn) generate(prng *rand.Rand) timeInMillisColumn { 154 t := time.Unix(0, prng.Int63()).UTC() 155 return timeInMillisColumn{Value: t} 156 } 157 158 type decimalColumn struct { 159 Value int64 `parquet:",decimal(0:3)"` 160 } 161 162 func (row decimalColumn) generate(prng *rand.Rand) decimalColumn { 163 return decimalColumn{Value: prng.Int63()} 164 } 165 166 type mapColumn struct { 167 Value map[utf8string]int 168 } 169 170 func (row mapColumn) generate(prng *rand.Rand) mapColumn { 171 n := prng.Intn(10) 172 row.Value = make(map[utf8string]int, n) 173 for i := 0; i < n; i++ { 174 row.Value[utf8string(generateString(prng, 8))] = prng.Intn(100) 175 } 176 return row 177 } 178 179 type addressBook struct { 180 Owner utf8string `parquet:",plain"` 181 OwnerPhoneNumbers []utf8string `parquet:",plain"` 182 Contacts []contact 183 } 184 185 type contact struct { 186 Name utf8string `parquet:",plain"` 187 PhoneNumber utf8string `parquet:",plain"` 188 } 189 190 func (row contact) generate(prng *rand.Rand) contact { 191 return contact{ 192 Name: utf8string(generateString(prng, 16)), 193 PhoneNumber: utf8string(generateString(prng, 10)), 194 } 195 } 196 197 type optionalInt32Column struct { 198 Value int32 `parquet:",optional"` 199 } 200 201 func (row optionalInt32Column) generate(prng *rand.Rand) optionalInt32Column { 202 return optionalInt32Column{Value: prng.Int31n(100)} 203 } 204 205 type repeatedInt32Column struct { 206 Values []int32 207 } 208 209 func (row repeatedInt32Column) generate(prng *rand.Rand) repeatedInt32Column { 210 row.Values = make([]int32, prng.Intn(10)) 211 for i := range row.Values { 212 row.Values[i] = prng.Int31n(10) 213 } 214 return row 215 } 216 217 type listColumn2 struct { 218 Value utf8string `parquet:",optional"` 219 } 220 221 type listColumn1 struct { 222 List2 []listColumn2 `parquet:",list"` 223 } 224 225 type listColumn0 struct { 226 List1 []listColumn1 `parquet:",list"` 227 } 228 229 type nestedListColumn1 struct { 230 Level3 []utf8string `parquet:"level3"` 231 } 232 233 type nestedListColumn struct { 234 Level1 []nestedListColumn1 `parquet:"level1"` 235 Level2 []utf8string `parquet:"level2"` 236 } 237 238 type utf8string string 239 240 func (utf8string) Generate(rand *rand.Rand, size int) reflect.Value { 241 const characters = "abcdefghijklmnopqrstuvwxyz1234567890" 242 const maxSize = 10 243 if size > maxSize { 244 size = maxSize 245 } 246 n := rand.Intn(size) 247 b := make([]byte, n) 248 for i := range b { 249 b[i] = characters[rand.Intn(len(characters))] 250 } 251 return reflect.ValueOf(utf8string(b)) 252 } 253 254 type Contact struct { 255 Name string `parquet:"name"` 256 PhoneNumber string `parquet:"phoneNumber,optional,zstd"` 257 } 258 259 type AddressBook struct { 260 Owner string `parquet:"owner,zstd"` 261 OwnerPhoneNumbers []string `parquet:"ownerPhoneNumbers,gzip"` 262 Contacts []Contact `parquet:"contacts"` 263 } 264 265 func forEachLeafColumn(col *parquet.Column, do func(*parquet.Column) error) error { 266 children := col.Columns() 267 268 if len(children) == 0 { 269 return do(col) 270 } 271 272 for _, child := range children { 273 if err := forEachLeafColumn(child, do); err != nil { 274 return err 275 } 276 } 277 278 return nil 279 } 280 281 func forEachPage(pages parquet.PageReader, do func(parquet.Page) error) error { 282 doAndReleasePage := func(page parquet.Page) error { 283 defer parquet.Release(page) 284 return do(page) 285 } 286 287 for { 288 p, err := pages.ReadPage() 289 if err != nil { 290 if err == io.EOF { 291 err = nil 292 } 293 return err 294 } 295 if err := doAndReleasePage(p); err != nil { 296 return err 297 } 298 } 299 } 300 301 func forEachValue(values parquet.ValueReader, do func(parquet.Value) error) error { 302 buffer := [3]parquet.Value{} 303 for { 304 n, err := values.ReadValues(buffer[:]) 305 for _, v := range buffer[:n] { 306 if err := do(v); err != nil { 307 return err 308 } 309 } 310 if err != nil { 311 if err == io.EOF { 312 err = nil 313 } 314 return err 315 } 316 } 317 } 318 319 func forEachColumnPage(col *parquet.Column, do func(*parquet.Column, parquet.Page) error) error { 320 return forEachLeafColumn(col, func(leaf *parquet.Column) error { 321 pages := leaf.Pages() 322 defer pages.Close() 323 return forEachPage(pages, func(page parquet.Page) error { return do(leaf, page) }) 324 }) 325 } 326 327 func forEachColumnValue(col *parquet.Column, do func(*parquet.Column, parquet.Value) error) error { 328 return forEachColumnPage(col, func(leaf *parquet.Column, page parquet.Page) error { 329 return forEachValue(page.Values(), func(value parquet.Value) error { return do(leaf, value) }) 330 }) 331 } 332 333 func forEachColumnChunk(file *parquet.File, do func(*parquet.Column, parquet.ColumnChunk) error) error { 334 return forEachLeafColumn(file.Root(), func(leaf *parquet.Column) error { 335 for _, rowGroup := range file.RowGroups() { 336 if err := do(leaf, rowGroup.ColumnChunks()[leaf.Index()]); err != nil { 337 return err 338 } 339 } 340 return nil 341 }) 342 } 343 344 func createParquetFile(rows rows, options ...parquet.WriterOption) (*parquet.File, error) { 345 buffer := new(bytes.Buffer) 346 347 if err := writeParquetFile(buffer, rows, options...); err != nil { 348 return nil, err 349 } 350 351 reader := bytes.NewReader(buffer.Bytes()) 352 return parquet.OpenFile(reader, reader.Size()) 353 } 354 355 func writeParquetFile(w io.Writer, rows rows, options ...parquet.WriterOption) error { 356 writer := parquet.NewWriter(w, options...) 357 358 for _, row := range rows { 359 if err := writer.Write(row); err != nil { 360 return err 361 } 362 } 363 364 return writer.Close() 365 } 366 367 func writeParquetFileWithBuffer(w io.Writer, rows rows, options ...parquet.WriterOption) error { 368 buffer := parquet.NewBuffer() 369 for _, row := range rows { 370 if err := buffer.Write(row); err != nil { 371 return err 372 } 373 } 374 375 writer := parquet.NewWriter(w, options...) 376 numRows, err := copyRowsAndClose(writer, buffer.Rows()) 377 if err != nil { 378 return err 379 } 380 if numRows != int64(len(rows)) { 381 return fmt.Errorf("wrong number of rows written from buffer to file: want=%d got=%d", len(rows), numRows) 382 } 383 return writer.Close() 384 } 385 386 type rows []interface{} 387 388 func makeRows(any interface{}) rows { 389 if v, ok := any.([]interface{}); ok { 390 return rows(v) 391 } 392 value := reflect.ValueOf(any) 393 slice := make([]interface{}, value.Len()) 394 for i := range slice { 395 slice[i] = value.Index(i).Interface() 396 } 397 return rows(slice) 398 } 399 400 func randValueFuncOf(t parquet.Type) func(*rand.Rand) parquet.Value { 401 switch k := t.Kind(); k { 402 case parquet.Boolean: 403 return func(r *rand.Rand) parquet.Value { 404 return parquet.ValueOf(r.Float64() < 0.5) 405 } 406 407 case parquet.Int32: 408 return func(r *rand.Rand) parquet.Value { 409 return parquet.ValueOf(r.Int31()) 410 } 411 412 case parquet.Int64: 413 return func(r *rand.Rand) parquet.Value { 414 return parquet.ValueOf(r.Int63()) 415 } 416 417 case parquet.Int96: 418 return func(r *rand.Rand) parquet.Value { 419 return parquet.ValueOf(deprecated.Int96{ 420 0: r.Uint32(), 421 1: r.Uint32(), 422 2: r.Uint32(), 423 }) 424 } 425 426 case parquet.Float: 427 return func(r *rand.Rand) parquet.Value { 428 return parquet.ValueOf(r.Float32()) 429 } 430 431 case parquet.Double: 432 return func(r *rand.Rand) parquet.Value { 433 return parquet.ValueOf(r.Float64()) 434 } 435 436 case parquet.ByteArray: 437 return func(r *rand.Rand) parquet.Value { 438 n := r.Intn(49) + 1 439 b := make([]byte, n) 440 const characters = "1234567890qwertyuiopasdfghjklzxcvbnm " 441 for i := range b { 442 b[i] = characters[r.Intn(len(characters))] 443 } 444 return parquet.ValueOf(b) 445 } 446 447 case parquet.FixedLenByteArray: 448 arrayType := reflect.ArrayOf(t.Length(), reflect.TypeOf(byte(0))) 449 return func(r *rand.Rand) parquet.Value { 450 b := make([]byte, arrayType.Len()) 451 r.Read(b) 452 v := reflect.New(arrayType).Elem() 453 reflect.Copy(v, reflect.ValueOf(b)) 454 return parquet.ValueOf(v.Interface()) 455 } 456 457 default: 458 panic("NOT IMPLEMENTED") 459 } 460 } 461 462 func copyRowsAndClose(w parquet.RowWriter, r parquet.Rows) (int64, error) { 463 defer r.Close() 464 return parquet.CopyRows(w, r) 465 } 466 467 func benchmarkRowsPerSecond(b *testing.B, f func() int) { 468 b.ResetTimer() 469 start := time.Now() 470 numRows := int64(0) 471 472 for i := 0; i < b.N; i++ { 473 n := f() 474 numRows += int64(n) 475 } 476 477 seconds := time.Since(start).Seconds() 478 b.ReportMetric(float64(numRows)/seconds, "row/s") 479 } 480 481 func generateString(r *rand.Rand, n int) string { 482 const characters = "1234567890qwertyuiopasdfghjklzxcvbnm" 483 b := new(strings.Builder) 484 for i := 0; i < n; i++ { 485 b.WriteByte(characters[r.Intn(len(characters))]) 486 } 487 return b.String() 488 } 489 490 var quickCheckConfig = quick.Config{ 491 Sizes: []int{ 492 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 493 10, 20, 30, 40, 50, 123, 494 4096 + 1, 495 }, 496 } 497 498 func quickCheck(f interface{}) error { 499 return quickCheckConfig.Check(f) 500 }