github.com/fraugster/parquet-go@v0.12.0/data_store_test.go (about) 1 package goparquet 2 3 import ( 4 "testing" 5 6 "github.com/stretchr/testify/assert" 7 8 "github.com/stretchr/testify/require" 9 10 "github.com/fraugster/parquet-go/parquet" 11 ) 12 13 func newIntStore() *ColumnStore { 14 d := newStore(&int32Store{ColumnParameters: &ColumnParameters{}, stats: newInt32Stats(), pageStats: newInt32Stats()}, parquet.Encoding_PLAIN, false, nil) 15 return d 16 } 17 18 func TestOneColumn(t *testing.T) { 19 row := schema{} 20 require.NoError(t, row.AddColumn("DocID", NewDataColumn(newIntStore(), parquet.FieldRepetitionType_REQUIRED))) 21 row.resetData() 22 23 data := []map[string]interface{}{ 24 {"DocID": int32(10)}, 25 {"DocID": int32(20)}, 26 } 27 28 for i := range data { 29 require.NoError(t, row.AddData(data[i])) 30 } 31 d, err := row.findDataColumn("DocID") 32 require.NoError(t, err) 33 assert.Equal(t, uint16(0), d.MaxDefinitionLevel()) 34 assert.Equal(t, uint16(0), d.MaxRepetitionLevel()) 35 assert.Equal(t, []interface{}{int32(10), int32(20)}, d.data.values.getValues()) 36 assert.Equal(t, []int32{0, 0}, d.data.dLevels.toArray()) 37 assert.Equal(t, []int32{0, 0}, d.data.rLevels.toArray()) 38 39 // Now reading data 40 41 for i := range data { 42 read, err := row.getData() 43 require.NoError(t, err) 44 assert.Equal(t, data[i], read) 45 } 46 } 47 48 func TestOneColumnOptional(t *testing.T) { 49 row := schema{} 50 require.NoError(t, row.AddColumn("DocID", NewDataColumn(newIntStore(), parquet.FieldRepetitionType_OPTIONAL))) 51 row.resetData() 52 53 data := []map[string]interface{}{ 54 {"DocID": int32(10)}, 55 {}, 56 } 57 58 for i := range data { 59 require.NoError(t, row.AddData(data[i])) 60 } 61 d, err := row.findDataColumn("DocID") 62 require.NoError(t, err) 63 assert.Equal(t, uint16(1), d.MaxDefinitionLevel()) 64 assert.Equal(t, uint16(0), d.MaxRepetitionLevel()) 65 assert.Equal(t, []interface{}{int32(10)}, d.data.values.getValues()) 66 assert.Equal(t, []int32{1, 0}, d.data.dLevels.toArray()) 67 assert.Equal(t, []int32{0, 0}, d.data.rLevels.toArray()) 68 69 for i := range data { 70 read, err := row.getData() 71 require.NoError(t, err) 72 assert.Equal(t, data[i], read) 73 } 74 } 75 76 func TestOneColumnRepeated(t *testing.T) { 77 row := schema{} 78 require.NoError(t, row.AddColumn("DocID", NewDataColumn(newIntStore(), parquet.FieldRepetitionType_REPEATED))) 79 row.resetData() 80 81 data := []map[string]interface{}{ 82 {"DocID": []int32{10, 20}}, 83 {}, 84 } 85 86 for i := range data { 87 require.NoError(t, row.AddData(data[i])) 88 } 89 d, err := row.findDataColumn("DocID") 90 require.NoError(t, err) 91 assert.Equal(t, uint16(1), d.MaxDefinitionLevel()) 92 assert.Equal(t, uint16(1), d.MaxRepetitionLevel()) 93 assert.Equal(t, []interface{}{int32(10), int32(20)}, d.data.values.getValues()) 94 assert.Equal(t, []int32{1, 1, 0}, d.data.dLevels.toArray()) 95 assert.Equal(t, []int32{0, 1, 0}, d.data.rLevels.toArray()) 96 97 for i := range data { 98 read, err := row.getData() 99 require.NoError(t, err) 100 assert.Equal(t, data[i], read) 101 } 102 } 103 104 func TestComplexPart1(t *testing.T) { 105 row := &schema{} 106 require.NoError(t, row.AddGroupByPath(ColumnPath{"Name"}, parquet.FieldRepetitionType_REPEATED)) 107 require.NoError(t, row.AddGroupByPath(ColumnPath{"Name", "Language"}, parquet.FieldRepetitionType_REPEATED)) 108 require.NoError(t, row.AddColumnByPath(ColumnPath{"Name", "Language", "Code"}, NewDataColumn(newIntStore(), parquet.FieldRepetitionType_REQUIRED))) 109 require.NoError(t, row.AddColumnByPath(ColumnPath{"Name", "Language", "Country"}, NewDataColumn(newIntStore(), parquet.FieldRepetitionType_OPTIONAL))) 110 require.NoError(t, row.AddColumnByPath(ColumnPath{"Name", "URL"}, NewDataColumn(newIntStore(), parquet.FieldRepetitionType_OPTIONAL))) 111 112 row.resetData() 113 114 data := []map[string]interface{}{ 115 { 116 "Name": []map[string]interface{}{ 117 { 118 "Language": []map[string]interface{}{ 119 { 120 "Code": int32(1), 121 "Country": int32(100), 122 }, 123 { 124 "Code": int32(2), 125 }, 126 }, 127 "URL": int32(10), 128 }, 129 { 130 "URL": int32(11), 131 }, 132 { 133 "Language": []map[string]interface{}{ 134 { 135 "Code": int32(3), 136 "Country": int32(101), 137 }, 138 }, 139 }, 140 }, 141 }, 142 } 143 144 for i := range data { 145 require.NoError(t, row.AddData(data[i])) 146 } 147 148 d, err := row.findDataColumn("Name.Language.Code") 149 require.NoError(t, err) 150 assert.Equal(t, uint16(2), d.MaxDefinitionLevel()) 151 assert.Equal(t, uint16(2), d.MaxRepetitionLevel()) 152 assert.Equal(t, []interface{}{int32(1), int32(2), int32(3)}, d.data.values.getValues()) 153 assert.Equal(t, []int32{2, 2, 1, 2}, d.data.dLevels.toArray()) 154 assert.Equal(t, []int32{0, 2, 1, 1}, d.data.rLevels.toArray()) 155 156 d, err = row.findDataColumn("Name.Language.Country") 157 require.NoError(t, err) 158 assert.Equal(t, uint16(3), d.MaxDefinitionLevel()) 159 assert.Equal(t, uint16(2), d.MaxRepetitionLevel()) 160 assert.Equal(t, []interface{}{int32(100), int32(101)}, d.data.values.getValues()) 161 assert.Equal(t, []int32{3, 2, 1, 3}, d.data.dLevels.toArray()) 162 assert.Equal(t, []int32{0, 2, 1, 1}, d.data.rLevels.toArray()) 163 164 d, err = row.findDataColumn("Name.URL") 165 require.NoError(t, err) 166 assert.Equal(t, uint16(2), d.MaxDefinitionLevel()) 167 assert.Equal(t, uint16(1), d.MaxRepetitionLevel()) 168 assert.Equal(t, []interface{}{int32(10), int32(11)}, d.data.values.getValues()) 169 assert.Equal(t, []int32{2, 2, 1}, d.data.dLevels.toArray()) 170 assert.Equal(t, []int32{0, 1, 1}, d.data.rLevels.toArray()) 171 172 for i := range data { 173 read, err := row.getData() 174 require.NoError(t, err) 175 assert.Equal(t, data[i], read) 176 } 177 } 178 179 func TestComplexPart2(t *testing.T) { 180 row := &schema{} 181 require.NoError(t, row.AddGroupByPath(ColumnPath{"Links"}, parquet.FieldRepetitionType_OPTIONAL)) 182 require.NoError(t, row.AddColumnByPath(ColumnPath{"Links", "Backward"}, NewDataColumn(newIntStore(), parquet.FieldRepetitionType_REPEATED))) 183 require.NoError(t, row.AddColumnByPath(ColumnPath{"Links", "Forward"}, NewDataColumn(newIntStore(), parquet.FieldRepetitionType_REPEATED))) 184 row.resetData() 185 186 data := []map[string]interface{}{ 187 { 188 "Links": map[string]interface{}{ 189 "Forward": []int32{20, 40, 60}, 190 }, 191 }, 192 { 193 "Links": map[string]interface{}{ 194 "Backward": []int32{10, 30}, 195 "Forward": []int32{80}, 196 }, 197 }, 198 } 199 200 for i := range data { 201 require.NoError(t, row.AddData(data[i])) 202 } 203 204 d, err := row.findDataColumn("Links.Forward") 205 require.NoError(t, err) 206 assert.Equal(t, uint16(2), d.MaxDefinitionLevel()) 207 assert.Equal(t, uint16(1), d.MaxRepetitionLevel()) 208 assert.Equal(t, []interface{}{int32(20), int32(40), int32(60), int32(80)}, d.data.values.getValues()) 209 assert.Equal(t, []int32{2, 2, 2, 2}, d.data.dLevels.toArray()) 210 assert.Equal(t, []int32{0, 1, 1, 0}, d.data.rLevels.toArray()) 211 212 d, err = row.findDataColumn("Links.Backward") 213 require.NoError(t, err) 214 assert.Equal(t, uint16(2), d.MaxDefinitionLevel()) 215 assert.Equal(t, uint16(1), d.MaxRepetitionLevel()) 216 assert.Equal(t, []interface{}{int32(10), int32(30)}, d.data.values.getValues()) 217 assert.Equal(t, []int32{1, 2, 2}, d.data.dLevels.toArray()) 218 assert.Equal(t, []int32{0, 0, 1}, d.data.rLevels.toArray()) 219 220 for i := range data { 221 read, err := row.getData() 222 require.NoError(t, err) 223 assert.Equal(t, data[i], read) 224 } 225 } 226 227 func TestComplex(t *testing.T) { 228 // Based on this picture https://i.stack.imgur.com/raOFu.png from this doc https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/36632.pdf 229 row := &schema{} 230 require.NoError(t, row.AddColumn("DocId", NewDataColumn(newIntStore(), parquet.FieldRepetitionType_REQUIRED))) 231 require.NoError(t, row.AddGroupByPath(ColumnPath{"Links"}, parquet.FieldRepetitionType_OPTIONAL)) 232 require.NoError(t, row.AddColumn("Links.Backward", NewDataColumn(newIntStore(), parquet.FieldRepetitionType_REPEATED))) 233 require.NoError(t, row.AddColumn("Links.Forward", NewDataColumn(newIntStore(), parquet.FieldRepetitionType_REPEATED))) 234 require.NoError(t, row.AddGroupByPath(ColumnPath{"Name"}, parquet.FieldRepetitionType_REPEATED)) 235 require.NoError(t, row.AddGroupByPath(ColumnPath{"Name", "Language"}, parquet.FieldRepetitionType_REPEATED)) 236 require.NoError(t, row.AddColumn("Name.Language.Code", NewDataColumn(newIntStore(), parquet.FieldRepetitionType_REQUIRED))) 237 require.NoError(t, row.AddColumn("Name.Language.Country", NewDataColumn(newIntStore(), parquet.FieldRepetitionType_OPTIONAL))) 238 require.NoError(t, row.AddColumn("Name.URL", NewDataColumn(newIntStore(), parquet.FieldRepetitionType_OPTIONAL))) 239 row.resetData() 240 241 data := []map[string]interface{}{ 242 { 243 "DocId": int32(10), 244 "Links": map[string]interface{}{ 245 "Forward": []int32{20, 40, 60}, 246 }, 247 "Name": []map[string]interface{}{ 248 { 249 "Language": []map[string]interface{}{ 250 { 251 "Code": int32(1), 252 "Country": int32(100), 253 }, 254 { 255 "Code": int32(2), 256 }, 257 }, 258 "URL": int32(10), 259 }, 260 { 261 "URL": int32(11), 262 }, 263 { 264 "Language": []map[string]interface{}{ 265 { 266 "Code": int32(3), 267 "Country": int32(101), 268 }, 269 }, 270 }, 271 }, 272 }, 273 { 274 "DocId": int32(20), 275 "Links": map[string]interface{}{ 276 "Backward": []int32{10, 30}, 277 "Forward": []int32{80}, 278 }, 279 "Name": []map[string]interface{}{ 280 { 281 "URL": int32(12), 282 }, 283 }, 284 }, 285 } 286 287 for i := range data { 288 require.NoError(t, row.AddData(data[i])) 289 } 290 291 d, err := row.findDataColumn("DocId") 292 require.NoError(t, err) 293 assert.Equal(t, uint16(0), d.MaxDefinitionLevel()) 294 assert.Equal(t, uint16(0), d.MaxRepetitionLevel()) 295 assert.Equal(t, []interface{}{int32(10), int32(20)}, d.data.values.getValues()) 296 assert.Equal(t, []int32{0, 0}, d.data.dLevels.toArray()) 297 assert.Equal(t, []int32{0, 0}, d.data.rLevels.toArray()) 298 299 d, err = row.findDataColumn("Name.URL") 300 require.NoError(t, err) 301 assert.Equal(t, uint16(2), d.MaxDefinitionLevel()) 302 assert.Equal(t, uint16(1), d.MaxRepetitionLevel()) 303 assert.Equal(t, []interface{}{int32(10), int32(11), int32(12)}, d.data.values.getValues()) 304 assert.Equal(t, []int32{2, 2, 1, 2}, d.data.dLevels.toArray()) 305 assert.Equal(t, []int32{0, 1, 1, 0}, d.data.rLevels.toArray()) 306 307 d, err = row.findDataColumn("Links.Forward") 308 require.NoError(t, err) 309 assert.Equal(t, uint16(2), d.MaxDefinitionLevel()) 310 assert.Equal(t, uint16(1), d.MaxRepetitionLevel()) 311 assert.Equal(t, []interface{}{int32(20), int32(40), int32(60), int32(80)}, d.data.values.getValues()) 312 assert.Equal(t, []int32{2, 2, 2, 2}, d.data.dLevels.toArray()) 313 assert.Equal(t, []int32{0, 1, 1, 0}, d.data.rLevels.toArray()) 314 315 d, err = row.findDataColumn("Links.Backward") 316 require.NoError(t, err) 317 assert.Equal(t, uint16(2), d.MaxDefinitionLevel()) 318 assert.Equal(t, uint16(1), d.MaxRepetitionLevel()) 319 assert.Equal(t, []interface{}{int32(10), int32(30)}, d.data.values.getValues()) 320 assert.Equal(t, []int32{1, 2, 2}, d.data.dLevels.toArray()) 321 assert.Equal(t, []int32{0, 0, 1}, d.data.rLevels.toArray()) 322 323 d, err = row.findDataColumn("Name.Language.Country") 324 require.NoError(t, err) 325 assert.Equal(t, uint16(3), d.MaxDefinitionLevel()) 326 assert.Equal(t, uint16(2), d.MaxRepetitionLevel()) 327 assert.Equal(t, []interface{}{int32(100), int32(101)}, d.data.values.getValues()) 328 assert.Equal(t, []int32{3, 2, 1, 3, 1}, d.data.dLevels.toArray()) 329 assert.Equal(t, []int32{0, 2, 1, 1, 0}, d.data.rLevels.toArray()) 330 331 d, err = row.findDataColumn("Name.Language.Code") 332 require.NoError(t, err) 333 assert.Equal(t, uint16(2), d.MaxDefinitionLevel()) 334 assert.Equal(t, uint16(2), d.MaxRepetitionLevel()) 335 assert.Equal(t, []interface{}{int32(1), int32(2), int32(3)}, d.data.values.getValues()) 336 assert.Equal(t, []int32{2, 2, 1, 2, 1}, d.data.dLevels.toArray()) 337 assert.Equal(t, []int32{0, 2, 1, 1, 0}, d.data.rLevels.toArray()) 338 339 for i := range data { 340 read, err := row.getData() 341 require.NoError(t, err) 342 assert.Equal(t, data[i], read) 343 } 344 } 345 346 func TestTwitterBlog(t *testing.T) { 347 // Sample from here https://blog.twitter.com/engineering/en_us/a/2013/dremel-made-simple-with-parquet.html 348 row := &schema{} 349 require.NoError(t, row.AddGroupByPath(ColumnPath{"level1"}, parquet.FieldRepetitionType_REPEATED)) 350 require.NoError(t, row.AddColumn("level1.level2", NewDataColumn(newIntStore(), parquet.FieldRepetitionType_REPEATED))) 351 row.resetData() 352 353 data := []map[string]interface{}{ 354 { 355 "level1": []map[string]interface{}{ 356 {"level2": []int32{1, 2, 3}}, 357 {"level2": []int32{4, 5, 6, 7}}, 358 }, 359 }, 360 { 361 "level1": []map[string]interface{}{ 362 {"level2": []int32{8}}, 363 {"level2": []int32{9, 10}}, 364 }, 365 }, 366 } 367 368 for i := range data { 369 require.NoError(t, row.AddData(data[i])) 370 } 371 372 d, err := row.findDataColumn("level1.level2") 373 require.NoError(t, err) 374 var expected []interface{} 375 for i := 1; i < 11; i++ { 376 expected = append(expected, int32(i)) 377 } 378 assert.Equal(t, expected, d.data.values.getValues()) 379 assert.Equal(t, uint16(2), d.MaxDefinitionLevel()) 380 assert.Equal(t, uint16(2), d.MaxRepetitionLevel()) 381 assert.Equal(t, []int32{0, 2, 2, 1, 2, 2, 2, 0, 1, 2}, d.data.rLevels.toArray()) 382 assert.Equal(t, []int32{2, 2, 2, 2, 2, 2, 2, 2, 2, 2}, d.data.dLevels.toArray()) 383 384 for i := range data { 385 read, err := row.getData() 386 require.NoError(t, err) 387 assert.Equal(t, data[i], read) 388 } 389 } 390 391 func TestEmptyParent(t *testing.T) { 392 elementStore, err := NewInt32Store(parquet.Encoding_PLAIN, true, &ColumnParameters{}) 393 require.NoError(t, err, "failed to create elementStore") 394 395 elementCol := NewDataColumn(elementStore, parquet.FieldRepetitionType_REQUIRED) 396 list, err := NewListColumn(elementCol, parquet.FieldRepetitionType_OPTIONAL) 397 require.NoError(t, err) 398 399 row := &schema{} 400 require.NoError(t, row.AddColumn("baz", list)) 401 row.resetData() 402 data := []map[string]interface{}{ 403 { 404 "baz": map[string]interface{}{}, 405 }, 406 } 407 408 for i := range data { 409 require.NoError(t, row.AddData(data[i])) 410 } 411 412 col, err := row.findDataColumn("baz.list.element") 413 require.NoError(t, err) 414 415 assert.Equal(t, []interface{}(nil), col.data.values.getValues()) 416 417 assert.Equal(t, uint16(2), col.MaxDefinitionLevel()) 418 assert.Equal(t, uint16(1), col.MaxRepetitionLevel()) 419 require.Equal(t, []int32{0}, col.data.rLevels.toArray()) 420 require.Equal(t, []int32{1}, col.data.dLevels.toArray()) 421 422 for i := range data { 423 read, err := row.getData() 424 require.NoError(t, err) 425 assert.Equal(t, data[i], read) 426 } 427 } 428 429 func TestZeroRL(t *testing.T) { 430 row := &schema{} 431 //message test_msg { 432 // required group baz (LIST) { 433 // repeated group list { 434 // required group element { 435 // required int64 quux; 436 // } 437 // } 438 // } 439 // } 440 require.NoError(t, row.AddGroupByPath(ColumnPath{"baz"}, parquet.FieldRepetitionType_REQUIRED)) 441 require.NoError(t, row.AddGroupByPath(ColumnPath{"baz", "list"}, parquet.FieldRepetitionType_REPEATED)) 442 require.NoError(t, row.AddGroupByPath(ColumnPath{"baz", "list", "element"}, parquet.FieldRepetitionType_REQUIRED)) 443 require.NoError(t, row.AddColumn("baz.list.element.quux", NewDataColumn(newIntStore(), parquet.FieldRepetitionType_REQUIRED))) 444 row.resetData() 445 446 data := map[string]interface{}{ 447 "baz": map[string]interface{}{ 448 "list": []map[string]interface{}{ 449 { 450 "element": map[string]interface{}{ 451 "quux": int32(23), 452 }, 453 }, 454 { 455 "element": map[string]interface{}{ 456 "quux": int32(42), 457 }, 458 }, 459 }, 460 }, 461 } 462 463 require.NoError(t, row.AddData(data)) 464 465 d, err := row.findDataColumn("baz.list.element.quux") 466 require.NoError(t, err) 467 var expected = []interface{}{int32(23), int32(42)} 468 assert.Equal(t, expected, d.data.values.getValues()) 469 assert.Equal(t, uint16(1), d.MaxDefinitionLevel()) 470 assert.Equal(t, uint16(1), d.MaxRepetitionLevel()) 471 assert.Equal(t, []int32{0, 1}, d.data.rLevels.toArray()) 472 assert.Equal(t, []int32{1, 1}, d.data.dLevels.toArray()) 473 474 read, err := row.getData() 475 require.NoError(t, err) 476 assert.Equal(t, data, read) 477 478 row = &schema{} 479 require.NoError(t, row.AddGroupByPath(ColumnPath{"baz"}, parquet.FieldRepetitionType_REQUIRED)) 480 require.NoError(t, row.AddGroupByPath(ColumnPath{"baz", "list"}, parquet.FieldRepetitionType_REPEATED)) 481 require.NoError(t, row.AddGroupByPath(ColumnPath{"baz", "list", "element"}, parquet.FieldRepetitionType_REQUIRED)) 482 require.NoError(t, row.AddColumn("baz.list.element.quux", NewDataColumn(newIntStore(), parquet.FieldRepetitionType_OPTIONAL))) 483 row.resetData() 484 require.NoError(t, row.AddData(data)) 485 486 d, err = row.findDataColumn("baz.list.element.quux") 487 require.NoError(t, err) 488 assert.Equal(t, expected, d.data.values.getValues()) 489 assert.Equal(t, uint16(2), d.MaxDefinitionLevel()) 490 assert.Equal(t, uint16(1), d.MaxRepetitionLevel()) 491 assert.Equal(t, []int32{0, 1}, d.data.rLevels.toArray()) 492 assert.Equal(t, []int32{2, 2}, d.data.dLevels.toArray()) 493 494 read, err = row.getData() 495 require.NoError(t, err) 496 assert.Equal(t, data, read) 497 }