github.com/apache/arrow/go/v7@v7.0.1/parquet/file/column_reader_test.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package file_test 18 19 import ( 20 "math" 21 "math/rand" 22 "reflect" 23 "testing" 24 25 "github.com/apache/arrow/go/v7/arrow/memory" 26 "github.com/apache/arrow/go/v7/parquet" 27 "github.com/apache/arrow/go/v7/parquet/file" 28 "github.com/apache/arrow/go/v7/parquet/internal/testutils" 29 "github.com/apache/arrow/go/v7/parquet/internal/utils" 30 "github.com/apache/arrow/go/v7/parquet/schema" 31 "github.com/stretchr/testify/assert" 32 "github.com/stretchr/testify/suite" 33 ) 34 35 func initValues(values reflect.Value) { 36 if values.Kind() != reflect.Slice { 37 panic("must init values with slice") 38 } 39 40 r := rand.New(rand.NewSource(0)) 41 typ := values.Type().Elem() 42 switch { 43 case typ.Bits() <= 32: 44 max := int64(math.MaxInt32) 45 min := int64(math.MinInt32) 46 for i := 0; i < values.Len(); i++ { 47 values.Index(i).Set(reflect.ValueOf(r.Int63n(max-min+1) + min).Convert(reflect.TypeOf(int32(0)))) 48 } 49 case typ.Bits() <= 64: 50 max := int64(math.MaxInt64) 51 min := int64(math.MinInt64) 52 for i := 0; i < values.Len(); i++ { 53 values.Index(i).Set(reflect.ValueOf(r.Int63n(max-min+1) + min)) 54 } 55 } 56 } 57 58 func initDictValues(values reflect.Value, numDicts int) { 59 repeatFactor := values.Len() / numDicts 60 initValues(values) 61 // add some repeated values 62 for j := 1; j < repeatFactor; j++ { 63 for i := 0; i < numDicts; i++ { 64 values.Index(numDicts*j + i).Set(values.Index(i)) 65 } 66 } 67 // computed only dict_per_page * repeat_factor - 1 values < num_values compute remaining 68 for i := numDicts * repeatFactor; i < values.Len(); i++ { 69 values.Index(i).Set(values.Index(i - numDicts*repeatFactor)) 70 } 71 } 72 73 func makePages(version parquet.DataPageVersion, d *schema.Column, npages, lvlsPerPage int, typ reflect.Type, enc parquet.Encoding) ([]file.Page, int, reflect.Value, []int16, []int16) { 74 nlevels := lvlsPerPage * npages 75 nvalues := 0 76 77 maxDef := d.MaxDefinitionLevel() 78 maxRep := d.MaxRepetitionLevel() 79 80 var ( 81 defLevels []int16 82 repLevels []int16 83 ) 84 85 valuesPerPage := make([]int, npages) 86 if maxDef > 0 { 87 defLevels = make([]int16, nlevels) 88 testutils.FillRandomInt16(0, 0, maxDef, defLevels) 89 for idx := range valuesPerPage { 90 numPerPage := 0 91 for i := 0; i < lvlsPerPage; i++ { 92 if defLevels[i+idx*lvlsPerPage] == maxDef { 93 numPerPage++ 94 nvalues++ 95 } 96 } 97 valuesPerPage[idx] = numPerPage 98 } 99 } else { 100 nvalues = nlevels 101 valuesPerPage[0] = lvlsPerPage 102 for i := 1; i < len(valuesPerPage); i *= 2 { 103 copy(valuesPerPage[i:], valuesPerPage[:i]) 104 } 105 } 106 107 if maxRep > 0 { 108 repLevels = make([]int16, nlevels) 109 testutils.FillRandomInt16(0, 0, maxRep, repLevels) 110 } 111 112 values := reflect.MakeSlice(reflect.SliceOf(typ), nvalues, nvalues) 113 if enc == parquet.Encodings.Plain { 114 initValues(values) 115 return testutils.PaginatePlain(version, d, values, defLevels, repLevels, maxDef, maxRep, lvlsPerPage, valuesPerPage, parquet.Encodings.Plain), nvalues, values, defLevels, repLevels 116 } else if enc == parquet.Encodings.PlainDict || enc == parquet.Encodings.RLEDict { 117 initDictValues(values, lvlsPerPage) 118 return testutils.PaginateDict(version, d, values, defLevels, repLevels, maxDef, maxRep, lvlsPerPage, valuesPerPage, parquet.Encodings.RLEDict), nvalues, values, defLevels, repLevels 119 } 120 panic("invalid encoding type for make pages") 121 } 122 123 func compareVectorWithDefLevels(left, right reflect.Value, defLevels []int16, maxDef, maxRep int16) assert.Comparison { 124 return func() bool { 125 if left.Kind() != reflect.Slice || right.Kind() != reflect.Slice { 126 return false 127 } 128 129 if left.Type().Elem() != right.Type().Elem() { 130 return false 131 } 132 133 iLeft, iRight := 0, 0 134 for _, def := range defLevels { 135 if def == maxDef { 136 if !reflect.DeepEqual(left.Index(iLeft).Interface(), right.Index(iRight).Interface()) { 137 return false 138 } 139 iLeft++ 140 iRight++ 141 } else if def == (maxDef - 1) { 142 // null entry on the lowest nested level 143 iRight++ 144 } else if def < (maxDef - 1) { 145 // null entry on higher nesting level, only supported for non-repeating data 146 if maxRep == 0 { 147 iRight++ 148 } 149 } 150 } 151 return true 152 } 153 } 154 155 var mem = memory.DefaultAllocator 156 157 type PrimitiveReaderSuite struct { 158 suite.Suite 159 160 dataPageVersion parquet.DataPageVersion 161 pager file.PageReader 162 reader file.ColumnChunkReader 163 pages []file.Page 164 values reflect.Value 165 defLevels []int16 166 repLevels []int16 167 nlevels int 168 nvalues int 169 maxDefLvl int16 170 maxRepLvl int16 171 } 172 173 func (p *PrimitiveReaderSuite) TearDownTest() { 174 p.clear() 175 } 176 177 func (p *PrimitiveReaderSuite) initReader(d *schema.Column) { 178 m := new(testutils.MockPageReader) 179 m.Test(p.T()) 180 m.TestData().Set("pages", p.pages) 181 m.On("Err").Return((error)(nil)) 182 p.pager = m 183 p.reader = file.NewColumnReader(d, m, mem) 184 } 185 186 func (p *PrimitiveReaderSuite) checkResults() { 187 vresult := make([]int32, p.nvalues) 188 dresult := make([]int16, p.nlevels) 189 rresult := make([]int16, p.nlevels) 190 191 var ( 192 read int64 = 0 193 totalRead int = 0 194 batchActual int = 0 195 batchSize int32 = 8 196 batch int = 0 197 ) 198 199 rdr := p.reader.(*file.Int32ColumnChunkReader) 200 p.Require().NotNil(rdr) 201 202 // this will cover both cases: 203 // 1) batch size < page size (multiple ReadBatch from a single page) 204 // 2) batch size > page size (BatchRead limits to single page) 205 for { 206 read, batch, _ = rdr.ReadBatch(int64(batchSize), vresult[totalRead:], dresult[batchActual:], rresult[batchActual:]) 207 totalRead += batch 208 batchActual += int(read) 209 batchSize = int32(utils.MinInt(1<<24, utils.MaxInt(int(batchSize*2), 4096))) 210 if batch <= 0 { 211 break 212 } 213 } 214 215 p.Equal(p.nlevels, batchActual) 216 p.Equal(p.nvalues, totalRead) 217 p.Equal(p.values.Interface(), vresult) 218 if p.maxDefLvl > 0 { 219 p.Equal(p.defLevels, dresult) 220 } 221 if p.maxRepLvl > 0 { 222 p.Equal(p.repLevels, rresult) 223 } 224 225 // catch improper writes at EOS 226 read, batchActual, _ = rdr.ReadBatch(5, vresult, nil, nil) 227 p.Zero(batchActual) 228 p.Zero(read) 229 } 230 231 func (p *PrimitiveReaderSuite) clear() { 232 p.values = reflect.ValueOf(nil) 233 p.defLevels = nil 234 p.repLevels = nil 235 p.pages = nil 236 p.pager = nil 237 p.reader = nil 238 } 239 240 func (p *PrimitiveReaderSuite) testPlain(npages, levels int, d *schema.Column) { 241 p.pages, p.nvalues, p.values, p.defLevels, p.repLevels = makePages(p.dataPageVersion, d, npages, levels, reflect.TypeOf(int32(0)), parquet.Encodings.Plain) 242 p.nlevels = npages * levels 243 p.initReader(d) 244 p.checkResults() 245 p.clear() 246 } 247 248 func (p *PrimitiveReaderSuite) testDict(npages, levels int, d *schema.Column) { 249 p.pages, p.nvalues, p.values, p.defLevels, p.repLevels = makePages(p.dataPageVersion, d, npages, levels, reflect.TypeOf(int32(0)), parquet.Encodings.RLEDict) 250 p.nlevels = npages * levels 251 p.initReader(d) 252 p.checkResults() 253 p.clear() 254 } 255 256 func (p *PrimitiveReaderSuite) TestInt32FlatRequired() { 257 const ( 258 levelsPerPage int = 100 259 npages int = 50 260 ) 261 262 p.maxDefLvl = 0 263 p.maxRepLvl = 0 264 265 typ := schema.NewInt32Node("a", parquet.Repetitions.Required, -1) 266 d := schema.NewColumn(typ, p.maxDefLvl, p.maxRepLvl) 267 p.testPlain(npages, levelsPerPage, d) 268 p.testDict(npages, levelsPerPage, d) 269 } 270 271 func (p *PrimitiveReaderSuite) TestInt32FlatOptional() { 272 const ( 273 levelsPerPage int = 100 274 npages int = 50 275 ) 276 277 p.maxDefLvl = 4 278 p.maxRepLvl = 0 279 typ := schema.NewInt32Node("b", parquet.Repetitions.Optional, -1) 280 d := schema.NewColumn(typ, p.maxDefLvl, p.maxRepLvl) 281 p.testPlain(npages, levelsPerPage, d) 282 p.testDict(npages, levelsPerPage, d) 283 } 284 285 func (p *PrimitiveReaderSuite) TestInt32FlatRepeated() { 286 const ( 287 levelsPerPage int = 100 288 npages int = 50 289 ) 290 291 p.maxDefLvl = 4 292 p.maxRepLvl = 2 293 typ := schema.NewInt32Node("c", parquet.Repetitions.Repeated, -1) 294 d := schema.NewColumn(typ, p.maxDefLvl, p.maxRepLvl) 295 p.testPlain(npages, levelsPerPage, d) 296 p.testDict(npages, levelsPerPage, d) 297 } 298 299 func (p *PrimitiveReaderSuite) TestReadBatchMultiPage() { 300 const ( 301 levelsPerPage int = 100 302 npages int = 3 303 ) 304 305 p.maxDefLvl = 0 306 p.maxRepLvl = 0 307 typ := schema.NewInt32Node("a", parquet.Repetitions.Required, -1) 308 d := schema.NewColumn(typ, p.maxDefLvl, p.maxRepLvl) 309 p.pages, p.nvalues, p.values, p.defLevels, p.repLevels = makePages(p.dataPageVersion, d, npages, levelsPerPage, reflect.TypeOf(int32(0)), parquet.Encodings.Plain) 310 p.initReader(d) 311 312 vresult := make([]int32, levelsPerPage*npages) 313 dresult := make([]int16, levelsPerPage*npages) 314 rresult := make([]int16, levelsPerPage*npages) 315 316 rdr := p.reader.(*file.Int32ColumnChunkReader) 317 total, read, err := rdr.ReadBatch(int64(levelsPerPage*npages), vresult, dresult, rresult) 318 p.NoError(err) 319 p.EqualValues(levelsPerPage*npages, total) 320 p.EqualValues(levelsPerPage*npages, read) 321 } 322 323 func (p *PrimitiveReaderSuite) TestInt32FlatRequiredSkip() { 324 const ( 325 levelsPerPage int = 100 326 npages int = 5 327 ) 328 329 p.maxDefLvl = 0 330 p.maxRepLvl = 0 331 typ := schema.NewInt32Node("a", parquet.Repetitions.Required, -1) 332 d := schema.NewColumn(typ, p.maxDefLvl, p.maxRepLvl) 333 p.pages, p.nvalues, p.values, p.defLevels, p.repLevels = makePages(p.dataPageVersion, d, npages, levelsPerPage, reflect.TypeOf(int32(0)), parquet.Encodings.Plain) 334 p.initReader(d) 335 336 vresult := make([]int32, levelsPerPage/2) 337 dresult := make([]int16, levelsPerPage/2) 338 rresult := make([]int16, levelsPerPage/2) 339 340 rdr := p.reader.(*file.Int32ColumnChunkReader) 341 342 p.Run("skip_size > page_size", func() { 343 // Skip first 2 pages 344 skipped, _ := rdr.Skip(int64(2 * levelsPerPage)) 345 p.Equal(int64(2*levelsPerPage), skipped) 346 347 rdr.ReadBatch(int64(levelsPerPage/2), vresult, dresult, rresult) 348 subVals := p.values.Slice(2*levelsPerPage, int(2.5*float64(levelsPerPage))).Interface().([]int32) 349 p.Equal(subVals, vresult) 350 }) 351 352 p.Run("skip_size == page_size", func() { 353 // skip across two pages 354 skipped, _ := rdr.Skip(int64(levelsPerPage)) 355 p.Equal(int64(levelsPerPage), skipped) 356 // read half a page 357 rdr.ReadBatch(int64(levelsPerPage/2), vresult, dresult, rresult) 358 subVals := p.values.Slice(int(3.5*float64(levelsPerPage)), 4*levelsPerPage).Interface().([]int32) 359 p.Equal(subVals, vresult) 360 }) 361 362 p.Run("skip_size < page_size", func() { 363 // skip limited to a single page 364 // Skip half a page 365 skipped, _ := rdr.Skip(int64(levelsPerPage / 2)) 366 p.Equal(int64(0.5*float32(levelsPerPage)), skipped) 367 // Read half a page 368 rdr.ReadBatch(int64(levelsPerPage/2), vresult, dresult, rresult) 369 subVals := p.values.Slice(int(4.5*float64(levelsPerPage)), p.values.Len()).Interface().([]int32) 370 p.Equal(subVals, vresult) 371 }) 372 } 373 374 func (p *PrimitiveReaderSuite) TestDictionaryEncodedPages() { 375 p.maxDefLvl = 0 376 p.maxRepLvl = 0 377 typ := schema.NewInt32Node("a", parquet.Repetitions.Required, -1) 378 descr := schema.NewColumn(typ, p.maxDefLvl, p.maxRepLvl) 379 dummy := memory.NewResizableBuffer(mem) 380 381 p.Run("Dict: Plain, Data: RLEDict", func() { 382 dictPage := file.NewDictionaryPage(dummy, 0, parquet.Encodings.Plain) 383 dataPage := testutils.MakeDataPage(p.dataPageVersion, descr, nil, 0, parquet.Encodings.RLEDict, dummy, nil, nil, 0, 0) 384 385 p.pages = append(p.pages, dictPage, dataPage) 386 p.initReader(descr) 387 p.NotPanics(func() { p.reader.HasNext() }) 388 p.NoError(p.reader.Err()) 389 p.pages = p.pages[:0] 390 }) 391 392 p.Run("Dict: Plain Dictionary, Data: Plain Dictionary", func() { 393 dictPage := file.NewDictionaryPage(dummy, 0, parquet.Encodings.PlainDict) 394 dataPage := testutils.MakeDataPage(p.dataPageVersion, descr, nil, 0, parquet.Encodings.PlainDict, dummy, nil, nil, 0, 0) 395 p.pages = append(p.pages, dictPage, dataPage) 396 p.initReader(descr) 397 p.NotPanics(func() { p.reader.HasNext() }) 398 p.NoError(p.reader.Err()) 399 p.pages = p.pages[:0] 400 }) 401 402 p.Run("Panic if dict page not first", func() { 403 dataPage := testutils.MakeDataPage(p.dataPageVersion, descr, nil, 0, parquet.Encodings.RLEDict, dummy, nil, nil, 0, 0) 404 p.pages = append(p.pages, dataPage) 405 p.initReader(descr) 406 p.NotPanics(func() { p.False(p.reader.HasNext()) }) 407 p.Error(p.reader.Err()) 408 p.pages = p.pages[:0] 409 }) 410 411 p.Run("Only RLE is supported", func() { 412 dictPage := file.NewDictionaryPage(dummy, 0, parquet.Encodings.DeltaByteArray) 413 p.pages = append(p.pages, dictPage) 414 p.initReader(descr) 415 p.NotPanics(func() { p.False(p.reader.HasNext()) }) 416 p.Error(p.reader.Err()) 417 p.pages = p.pages[:0] 418 }) 419 420 p.Run("Cannot have more than one dict", func() { 421 dictPage1 := file.NewDictionaryPage(dummy, 0, parquet.Encodings.PlainDict) 422 dictPage2 := file.NewDictionaryPage(dummy, 0, parquet.Encodings.Plain) 423 p.pages = append(p.pages, dictPage1, dictPage2) 424 p.initReader(descr) 425 p.NotPanics(func() { p.False(p.reader.HasNext()) }) 426 p.Error(p.reader.Err()) 427 p.pages = p.pages[:0] 428 }) 429 430 p.Run("Unsupported encoding", func() { 431 dataPage := testutils.MakeDataPage(p.dataPageVersion, descr, nil, 0, parquet.Encodings.DeltaByteArray, dummy, nil, nil, 0, 0) 432 p.pages = append(p.pages, dataPage) 433 p.initReader(descr) 434 p.Panics(func() { p.reader.HasNext() }) 435 // p.Error(p.reader.Err()) 436 p.pages = p.pages[:0] 437 }) 438 439 p.pages = p.pages[:2] 440 } 441 442 func TestPrimitiveReader(t *testing.T) { 443 t.Parallel() 444 t.Run("datapage v1", func(t *testing.T) { 445 suite.Run(t, new(PrimitiveReaderSuite)) 446 }) 447 t.Run("datapage v2", func(t *testing.T) { 448 suite.Run(t, &PrimitiveReaderSuite{dataPageVersion: parquet.DataPageV2}) 449 }) 450 }