github.com/apache/arrow/go/v7@v7.0.1/parquet/file/column_reader_test.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package file_test
    18  
    19  import (
    20  	"math"
    21  	"math/rand"
    22  	"reflect"
    23  	"testing"
    24  
    25  	"github.com/apache/arrow/go/v7/arrow/memory"
    26  	"github.com/apache/arrow/go/v7/parquet"
    27  	"github.com/apache/arrow/go/v7/parquet/file"
    28  	"github.com/apache/arrow/go/v7/parquet/internal/testutils"
    29  	"github.com/apache/arrow/go/v7/parquet/internal/utils"
    30  	"github.com/apache/arrow/go/v7/parquet/schema"
    31  	"github.com/stretchr/testify/assert"
    32  	"github.com/stretchr/testify/suite"
    33  )
    34  
    35  func initValues(values reflect.Value) {
    36  	if values.Kind() != reflect.Slice {
    37  		panic("must init values with slice")
    38  	}
    39  
    40  	r := rand.New(rand.NewSource(0))
    41  	typ := values.Type().Elem()
    42  	switch {
    43  	case typ.Bits() <= 32:
    44  		max := int64(math.MaxInt32)
    45  		min := int64(math.MinInt32)
    46  		for i := 0; i < values.Len(); i++ {
    47  			values.Index(i).Set(reflect.ValueOf(r.Int63n(max-min+1) + min).Convert(reflect.TypeOf(int32(0))))
    48  		}
    49  	case typ.Bits() <= 64:
    50  		max := int64(math.MaxInt64)
    51  		min := int64(math.MinInt64)
    52  		for i := 0; i < values.Len(); i++ {
    53  			values.Index(i).Set(reflect.ValueOf(r.Int63n(max-min+1) + min))
    54  		}
    55  	}
    56  }
    57  
    58  func initDictValues(values reflect.Value, numDicts int) {
    59  	repeatFactor := values.Len() / numDicts
    60  	initValues(values)
    61  	// add some repeated values
    62  	for j := 1; j < repeatFactor; j++ {
    63  		for i := 0; i < numDicts; i++ {
    64  			values.Index(numDicts*j + i).Set(values.Index(i))
    65  		}
    66  	}
    67  	// computed only dict_per_page * repeat_factor - 1 values < num_values compute remaining
    68  	for i := numDicts * repeatFactor; i < values.Len(); i++ {
    69  		values.Index(i).Set(values.Index(i - numDicts*repeatFactor))
    70  	}
    71  }
    72  
    73  func makePages(version parquet.DataPageVersion, d *schema.Column, npages, lvlsPerPage int, typ reflect.Type, enc parquet.Encoding) ([]file.Page, int, reflect.Value, []int16, []int16) {
    74  	nlevels := lvlsPerPage * npages
    75  	nvalues := 0
    76  
    77  	maxDef := d.MaxDefinitionLevel()
    78  	maxRep := d.MaxRepetitionLevel()
    79  
    80  	var (
    81  		defLevels []int16
    82  		repLevels []int16
    83  	)
    84  
    85  	valuesPerPage := make([]int, npages)
    86  	if maxDef > 0 {
    87  		defLevels = make([]int16, nlevels)
    88  		testutils.FillRandomInt16(0, 0, maxDef, defLevels)
    89  		for idx := range valuesPerPage {
    90  			numPerPage := 0
    91  			for i := 0; i < lvlsPerPage; i++ {
    92  				if defLevels[i+idx*lvlsPerPage] == maxDef {
    93  					numPerPage++
    94  					nvalues++
    95  				}
    96  			}
    97  			valuesPerPage[idx] = numPerPage
    98  		}
    99  	} else {
   100  		nvalues = nlevels
   101  		valuesPerPage[0] = lvlsPerPage
   102  		for i := 1; i < len(valuesPerPage); i *= 2 {
   103  			copy(valuesPerPage[i:], valuesPerPage[:i])
   104  		}
   105  	}
   106  
   107  	if maxRep > 0 {
   108  		repLevels = make([]int16, nlevels)
   109  		testutils.FillRandomInt16(0, 0, maxRep, repLevels)
   110  	}
   111  
   112  	values := reflect.MakeSlice(reflect.SliceOf(typ), nvalues, nvalues)
   113  	if enc == parquet.Encodings.Plain {
   114  		initValues(values)
   115  		return testutils.PaginatePlain(version, d, values, defLevels, repLevels, maxDef, maxRep, lvlsPerPage, valuesPerPage, parquet.Encodings.Plain), nvalues, values, defLevels, repLevels
   116  	} else if enc == parquet.Encodings.PlainDict || enc == parquet.Encodings.RLEDict {
   117  		initDictValues(values, lvlsPerPage)
   118  		return testutils.PaginateDict(version, d, values, defLevels, repLevels, maxDef, maxRep, lvlsPerPage, valuesPerPage, parquet.Encodings.RLEDict), nvalues, values, defLevels, repLevels
   119  	}
   120  	panic("invalid encoding type for make pages")
   121  }
   122  
   123  func compareVectorWithDefLevels(left, right reflect.Value, defLevels []int16, maxDef, maxRep int16) assert.Comparison {
   124  	return func() bool {
   125  		if left.Kind() != reflect.Slice || right.Kind() != reflect.Slice {
   126  			return false
   127  		}
   128  
   129  		if left.Type().Elem() != right.Type().Elem() {
   130  			return false
   131  		}
   132  
   133  		iLeft, iRight := 0, 0
   134  		for _, def := range defLevels {
   135  			if def == maxDef {
   136  				if !reflect.DeepEqual(left.Index(iLeft).Interface(), right.Index(iRight).Interface()) {
   137  					return false
   138  				}
   139  				iLeft++
   140  				iRight++
   141  			} else if def == (maxDef - 1) {
   142  				// null entry on the lowest nested level
   143  				iRight++
   144  			} else if def < (maxDef - 1) {
   145  				// null entry on higher nesting level, only supported for non-repeating data
   146  				if maxRep == 0 {
   147  					iRight++
   148  				}
   149  			}
   150  		}
   151  		return true
   152  	}
   153  }
   154  
   155  var mem = memory.DefaultAllocator
   156  
// PrimitiveReaderSuite exercises the primitive column chunk readers against
// synthetic pages for a particular data page version.
type PrimitiveReaderSuite struct {
	suite.Suite

	dataPageVersion parquet.DataPageVersion // data page format under test (set to V2 explicitly; zero value otherwise)
	pager           file.PageReader         // mock page source feeding the reader
	reader          file.ColumnChunkReader  // reader under test
	pages           []file.Page             // synthetic pages served by pager
	values          reflect.Value           // expected decoded values (a slice)
	defLevels       []int16                 // expected definition levels
	repLevels       []int16                 // expected repetition levels
	nlevels         int                     // total number of def/rep levels across all pages
	nvalues         int                     // total number of non-null values
	maxDefLvl       int16                   // max definition level for the test column
	maxRepLvl       int16                   // max repetition level for the test column
}
   172  
// TearDownTest resets all per-test state after each test in the suite.
func (p *PrimitiveReaderSuite) TearDownTest() {
	p.clear()
}
   176  
   177  func (p *PrimitiveReaderSuite) initReader(d *schema.Column) {
   178  	m := new(testutils.MockPageReader)
   179  	m.Test(p.T())
   180  	m.TestData().Set("pages", p.pages)
   181  	m.On("Err").Return((error)(nil))
   182  	p.pager = m
   183  	p.reader = file.NewColumnReader(d, m, mem)
   184  }
   185  
// checkResults reads the entire column back through p.reader in batches of
// growing size and verifies the values and levels match what the suite
// generated. It also confirms that reading past end-of-stream is a no-op.
func (p *PrimitiveReaderSuite) checkResults() {
	vresult := make([]int32, p.nvalues)
	dresult := make([]int16, p.nlevels)
	rresult := make([]int16, p.nlevels)

	var (
		read        int64 = 0 // levels returned by the most recent ReadBatch
		totalRead   int   = 0 // running count of values read
		batchActual int   = 0 // running count of levels read
		batchSize   int32 = 8 // grows each iteration; see update below
		batch       int   = 0 // values returned by the most recent ReadBatch
	)

	rdr := p.reader.(*file.Int32ColumnChunkReader)
	p.Require().NotNil(rdr)

	// this will cover both cases:
	// 1) batch size < page size (multiple ReadBatch from a single page)
	// 2) batch size > page size (BatchRead limits to single page)
	for {
		read, batch, _ = rdr.ReadBatch(int64(batchSize), vresult[totalRead:], dresult[batchActual:], rresult[batchActual:])
		totalRead += batch
		batchActual += int(read)
		// double the batch size (floor 4096, cap 1<<24) so both small and
		// large batches get exercised against the page boundaries
		batchSize = int32(utils.MinInt(1<<24, utils.MaxInt(int(batchSize*2), 4096)))
		if batch <= 0 {
			break
		}
	}

	p.Equal(p.nlevels, batchActual)
	p.Equal(p.nvalues, totalRead)
	p.Equal(p.values.Interface(), vresult)
	if p.maxDefLvl > 0 {
		p.Equal(p.defLevels, dresult)
	}
	if p.maxRepLvl > 0 {
		p.Equal(p.repLevels, rresult)
	}

	// catch improper writes at EOS
	read, batchActual, _ = rdr.ReadBatch(5, vresult, nil, nil)
	p.Zero(batchActual)
	p.Zero(read)
}
   230  
   231  func (p *PrimitiveReaderSuite) clear() {
   232  	p.values = reflect.ValueOf(nil)
   233  	p.defLevels = nil
   234  	p.repLevels = nil
   235  	p.pages = nil
   236  	p.pager = nil
   237  	p.reader = nil
   238  }
   239  
// testPlain generates npages plain-encoded pages of int32 data for column d,
// runs the full read-back verification, and clears the suite state.
func (p *PrimitiveReaderSuite) testPlain(npages, levels int, d *schema.Column) {
	p.pages, p.nvalues, p.values, p.defLevels, p.repLevels = makePages(p.dataPageVersion, d, npages, levels, reflect.TypeOf(int32(0)), parquet.Encodings.Plain)
	p.nlevels = npages * levels
	p.initReader(d)
	p.checkResults()
	p.clear()
}
   247  
// testDict generates npages dictionary-encoded (RLE dict) pages of int32
// data for column d, runs the full read-back verification, and clears the
// suite state.
func (p *PrimitiveReaderSuite) testDict(npages, levels int, d *schema.Column) {
	p.pages, p.nvalues, p.values, p.defLevels, p.repLevels = makePages(p.dataPageVersion, d, npages, levels, reflect.TypeOf(int32(0)), parquet.Encodings.RLEDict)
	p.nlevels = npages * levels
	p.initReader(d)
	p.checkResults()
	p.clear()
}
   255  
   256  func (p *PrimitiveReaderSuite) TestInt32FlatRequired() {
   257  	const (
   258  		levelsPerPage int = 100
   259  		npages        int = 50
   260  	)
   261  
   262  	p.maxDefLvl = 0
   263  	p.maxRepLvl = 0
   264  
   265  	typ := schema.NewInt32Node("a", parquet.Repetitions.Required, -1)
   266  	d := schema.NewColumn(typ, p.maxDefLvl, p.maxRepLvl)
   267  	p.testPlain(npages, levelsPerPage, d)
   268  	p.testDict(npages, levelsPerPage, d)
   269  }
   270  
   271  func (p *PrimitiveReaderSuite) TestInt32FlatOptional() {
   272  	const (
   273  		levelsPerPage int = 100
   274  		npages        int = 50
   275  	)
   276  
   277  	p.maxDefLvl = 4
   278  	p.maxRepLvl = 0
   279  	typ := schema.NewInt32Node("b", parquet.Repetitions.Optional, -1)
   280  	d := schema.NewColumn(typ, p.maxDefLvl, p.maxRepLvl)
   281  	p.testPlain(npages, levelsPerPage, d)
   282  	p.testDict(npages, levelsPerPage, d)
   283  }
   284  
   285  func (p *PrimitiveReaderSuite) TestInt32FlatRepeated() {
   286  	const (
   287  		levelsPerPage int = 100
   288  		npages        int = 50
   289  	)
   290  
   291  	p.maxDefLvl = 4
   292  	p.maxRepLvl = 2
   293  	typ := schema.NewInt32Node("c", parquet.Repetitions.Repeated, -1)
   294  	d := schema.NewColumn(typ, p.maxDefLvl, p.maxRepLvl)
   295  	p.testPlain(npages, levelsPerPage, d)
   296  	p.testDict(npages, levelsPerPage, d)
   297  }
   298  
   299  func (p *PrimitiveReaderSuite) TestReadBatchMultiPage() {
   300  	const (
   301  		levelsPerPage int = 100
   302  		npages        int = 3
   303  	)
   304  
   305  	p.maxDefLvl = 0
   306  	p.maxRepLvl = 0
   307  	typ := schema.NewInt32Node("a", parquet.Repetitions.Required, -1)
   308  	d := schema.NewColumn(typ, p.maxDefLvl, p.maxRepLvl)
   309  	p.pages, p.nvalues, p.values, p.defLevels, p.repLevels = makePages(p.dataPageVersion, d, npages, levelsPerPage, reflect.TypeOf(int32(0)), parquet.Encodings.Plain)
   310  	p.initReader(d)
   311  
   312  	vresult := make([]int32, levelsPerPage*npages)
   313  	dresult := make([]int16, levelsPerPage*npages)
   314  	rresult := make([]int16, levelsPerPage*npages)
   315  
   316  	rdr := p.reader.(*file.Int32ColumnChunkReader)
   317  	total, read, err := rdr.ReadBatch(int64(levelsPerPage*npages), vresult, dresult, rresult)
   318  	p.NoError(err)
   319  	p.EqualValues(levelsPerPage*npages, total)
   320  	p.EqualValues(levelsPerPage*npages, read)
   321  }
   322  
// TestInt32FlatRequiredSkip verifies Skip across page boundaries for a flat
// required int32 column. NOTE: the three subtests share one reader and run
// in order, so each skip/read picks up where the previous subtest stopped.
func (p *PrimitiveReaderSuite) TestInt32FlatRequiredSkip() {
	const (
		levelsPerPage int = 100
		npages        int = 5
	)

	p.maxDefLvl = 0
	p.maxRepLvl = 0
	typ := schema.NewInt32Node("a", parquet.Repetitions.Required, -1)
	d := schema.NewColumn(typ, p.maxDefLvl, p.maxRepLvl)
	p.pages, p.nvalues, p.values, p.defLevels, p.repLevels = makePages(p.dataPageVersion, d, npages, levelsPerPage, reflect.TypeOf(int32(0)), parquet.Encodings.Plain)
	p.initReader(d)

	// half-page sized result buffers; each subtest reads levelsPerPage/2 values
	vresult := make([]int32, levelsPerPage/2)
	dresult := make([]int16, levelsPerPage/2)
	rresult := make([]int16, levelsPerPage/2)

	rdr := p.reader.(*file.Int32ColumnChunkReader)

	p.Run("skip_size > page_size", func() {
		// Skip first 2 pages
		skipped, _ := rdr.Skip(int64(2 * levelsPerPage))
		p.Equal(int64(2*levelsPerPage), skipped)

		// read the first half of page 3 and compare against the source values
		rdr.ReadBatch(int64(levelsPerPage/2), vresult, dresult, rresult)
		subVals := p.values.Slice(2*levelsPerPage, int(2.5*float64(levelsPerPage))).Interface().([]int32)
		p.Equal(subVals, vresult)
	})

	p.Run("skip_size == page_size", func() {
		// skip across two pages
		skipped, _ := rdr.Skip(int64(levelsPerPage))
		p.Equal(int64(levelsPerPage), skipped)
		// read half a page
		rdr.ReadBatch(int64(levelsPerPage/2), vresult, dresult, rresult)
		subVals := p.values.Slice(int(3.5*float64(levelsPerPage)), 4*levelsPerPage).Interface().([]int32)
		p.Equal(subVals, vresult)
	})

	p.Run("skip_size < page_size", func() {
		// skip limited to a single page
		// Skip half a page
		skipped, _ := rdr.Skip(int64(levelsPerPage / 2))
		p.Equal(int64(0.5*float32(levelsPerPage)), skipped)
		// Read half a page
		rdr.ReadBatch(int64(levelsPerPage/2), vresult, dresult, rresult)
		subVals := p.values.Slice(int(4.5*float64(levelsPerPage)), p.values.Len()).Interface().([]int32)
		p.Equal(subVals, vresult)
	})
}
   373  
// TestDictionaryEncodedPages checks how the reader handles valid and invalid
// combinations of dictionary pages and data page encodings. The pages carry
// an empty buffer; only the header/encoding validation paths are exercised.
func (p *PrimitiveReaderSuite) TestDictionaryEncodedPages() {
	p.maxDefLvl = 0
	p.maxRepLvl = 0
	typ := schema.NewInt32Node("a", parquet.Repetitions.Required, -1)
	descr := schema.NewColumn(typ, p.maxDefLvl, p.maxRepLvl)
	// empty page payload shared by every synthetic page below
	dummy := memory.NewResizableBuffer(mem)

	p.Run("Dict: Plain, Data: RLEDict", func() {
		// valid combination: plain-encoded dict page followed by RLE-dict data
		dictPage := file.NewDictionaryPage(dummy, 0, parquet.Encodings.Plain)
		dataPage := testutils.MakeDataPage(p.dataPageVersion, descr, nil, 0, parquet.Encodings.RLEDict, dummy, nil, nil, 0, 0)

		p.pages = append(p.pages, dictPage, dataPage)
		p.initReader(descr)
		p.NotPanics(func() { p.reader.HasNext() })
		p.NoError(p.reader.Err())
		p.pages = p.pages[:0]
	})

	p.Run("Dict: Plain Dictionary, Data: Plain Dictionary", func() {
		// valid legacy combination using the deprecated PLAIN_DICTIONARY encoding
		dictPage := file.NewDictionaryPage(dummy, 0, parquet.Encodings.PlainDict)
		dataPage := testutils.MakeDataPage(p.dataPageVersion, descr, nil, 0, parquet.Encodings.PlainDict, dummy, nil, nil, 0, 0)
		p.pages = append(p.pages, dictPage, dataPage)
		p.initReader(descr)
		p.NotPanics(func() { p.reader.HasNext() })
		p.NoError(p.reader.Err())
		p.pages = p.pages[:0]
	})

	p.Run("Panic if dict page not first", func() {
		// a dict-encoded data page with no preceding dictionary page must
		// surface an error, not a panic
		dataPage := testutils.MakeDataPage(p.dataPageVersion, descr, nil, 0, parquet.Encodings.RLEDict, dummy, nil, nil, 0, 0)
		p.pages = append(p.pages, dataPage)
		p.initReader(descr)
		p.NotPanics(func() { p.False(p.reader.HasNext()) })
		p.Error(p.reader.Err())
		p.pages = p.pages[:0]
	})

	p.Run("Only RLE is supported", func() {
		// unsupported dictionary page encoding must surface an error
		dictPage := file.NewDictionaryPage(dummy, 0, parquet.Encodings.DeltaByteArray)
		p.pages = append(p.pages, dictPage)
		p.initReader(descr)
		p.NotPanics(func() { p.False(p.reader.HasNext()) })
		p.Error(p.reader.Err())
		p.pages = p.pages[:0]
	})

	p.Run("Cannot have more than one dict", func() {
		// a second dictionary page in the same chunk must surface an error
		dictPage1 := file.NewDictionaryPage(dummy, 0, parquet.Encodings.PlainDict)
		dictPage2 := file.NewDictionaryPage(dummy, 0, parquet.Encodings.Plain)
		p.pages = append(p.pages, dictPage1, dictPage2)
		p.initReader(descr)
		p.NotPanics(func() { p.False(p.reader.HasNext()) })
		p.Error(p.reader.Err())
		p.pages = p.pages[:0]
	})

	p.Run("Unsupported encoding", func() {
		// currently an unsupported data page encoding panics rather than
		// returning an error (hence the commented-out Err assertion)
		dataPage := testutils.MakeDataPage(p.dataPageVersion, descr, nil, 0, parquet.Encodings.DeltaByteArray, dummy, nil, nil, 0, 0)
		p.pages = append(p.pages, dataPage)
		p.initReader(descr)
		p.Panics(func() { p.reader.HasNext() })
		// p.Error(p.reader.Err())
		p.pages = p.pages[:0]
	})

	// NOTE(review): re-slicing back to length 2 exposes the stale pages from
	// the last subtest before TearDownTest clears them — presumably to keep
	// references alive until teardown; confirm whether this is still needed.
	p.pages = p.pages[:2]
}
   441  
   442  func TestPrimitiveReader(t *testing.T) {
   443  	t.Parallel()
   444  	t.Run("datapage v1", func(t *testing.T) {
   445  		suite.Run(t, new(PrimitiveReaderSuite))
   446  	})
   447  	t.Run("datapage v2", func(t *testing.T) {
   448  		suite.Run(t, &PrimitiveReaderSuite{dataPageVersion: parquet.DataPageV2})
   449  	})
   450  }