github.com/apache/arrow/go/v7@v7.0.1/parquet/internal/testutils/pagebuilder.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package testutils
    18  
    19  import (
    20  	"encoding/binary"
    21  	"io"
    22  	"reflect"
    23  
    24  	"github.com/apache/arrow/go/v7/arrow/memory"
    25  	"github.com/apache/arrow/go/v7/parquet"
    26  	"github.com/apache/arrow/go/v7/parquet/compress"
    27  	"github.com/apache/arrow/go/v7/parquet/file"
    28  	"github.com/apache/arrow/go/v7/parquet/internal/encoding"
    29  	"github.com/apache/arrow/go/v7/parquet/internal/utils"
    30  	"github.com/apache/arrow/go/v7/parquet/schema"
    31  	"github.com/stretchr/testify/mock"
    32  )
    33  
// DataPageBuilder incrementally assembles the body of a single Parquet data
// page (levels first, then values) into sink, while tracking the metadata
// needed to build the page header afterwards.
type DataPageBuilder struct {
	sink    io.Writer               // destination for the raw page bytes
	version parquet.DataPageVersion // V1 or V2; affects level serialization

	nvals          int              // running max of levels/values appended
	encoding       parquet.Encoding // value encoding recorded by AppendValues
	defLvlEncoding parquet.Encoding // definition-level encoding (RLE only)
	repLvlEncoding parquet.Encoding // repetition-level encoding (RLE only)
	defLvlBytesLen int              // encoded byte length of def levels (for V2 headers)
	repLvlBytesLen int              // encoded byte length of rep levels (for V2 headers)
	hasDefLvls     bool
	hasRepLvls     bool
	hasValues      bool
}
    48  
    49  var mem = memory.NewGoAllocator()
    50  
    51  func (d *DataPageBuilder) appendLevels(lvls []int16, maxLvl int16, e parquet.Encoding) int {
    52  	if e != parquet.Encodings.RLE {
    53  		panic("parquet: only rle encoding currently implemented")
    54  	}
    55  
    56  	buf := encoding.NewBufferWriter(encoding.LevelEncodingMaxBufferSize(e, maxLvl, len(lvls)), memory.DefaultAllocator)
    57  	var enc encoding.LevelEncoder
    58  	enc.Init(e, maxLvl, buf)
    59  	enc.Encode(lvls)
    60  
    61  	rleBytes := enc.Len()
    62  	if d.version == parquet.DataPageV1 {
    63  		if err := binary.Write(d.sink, binary.LittleEndian, int32(rleBytes)); err != nil {
    64  			panic(err)
    65  		}
    66  	}
    67  
    68  	if _, err := d.sink.Write(buf.Bytes()[:rleBytes]); err != nil {
    69  		panic(err)
    70  	}
    71  	return rleBytes
    72  }
    73  
    74  func (d *DataPageBuilder) AppendDefLevels(lvls []int16, maxLvl int16) {
    75  	d.defLvlBytesLen = d.appendLevels(lvls, maxLvl, parquet.Encodings.RLE)
    76  
    77  	d.nvals = utils.MaxInt(len(lvls), d.nvals)
    78  	d.defLvlEncoding = parquet.Encodings.RLE
    79  	d.hasDefLvls = true
    80  }
    81  
    82  func (d *DataPageBuilder) AppendRepLevels(lvls []int16, maxLvl int16) {
    83  	d.repLvlBytesLen = d.appendLevels(lvls, maxLvl, parquet.Encodings.RLE)
    84  
    85  	d.nvals = utils.MaxInt(len(lvls), d.nvals)
    86  	d.repLvlEncoding = parquet.Encodings.RLE
    87  	d.hasRepLvls = true
    88  }
    89  
    90  func (d *DataPageBuilder) AppendValues(desc *schema.Column, values interface{}, e parquet.Encoding) {
    91  	enc := encoding.NewEncoder(desc.PhysicalType(), e, false, desc, mem)
    92  	var sz int
    93  	switch v := values.(type) {
    94  	case []int32:
    95  		enc.(encoding.Int32Encoder).Put(v)
    96  		sz = len(v)
    97  	case []int64:
    98  		enc.(encoding.Int64Encoder).Put(v)
    99  		sz = len(v)
   100  	case []parquet.Int96:
   101  		enc.(encoding.Int96Encoder).Put(v)
   102  		sz = len(v)
   103  	case []float32:
   104  		enc.(encoding.Float32Encoder).Put(v)
   105  		sz = len(v)
   106  	case []float64:
   107  		enc.(encoding.Float64Encoder).Put(v)
   108  		sz = len(v)
   109  	case []parquet.ByteArray:
   110  		enc.(encoding.ByteArrayEncoder).Put(v)
   111  		sz = len(v)
   112  	}
   113  	buf, _ := enc.FlushValues()
   114  	_, err := d.sink.Write(buf.Bytes())
   115  	if err != nil {
   116  		panic(err)
   117  	}
   118  
   119  	d.nvals = utils.MaxInt(sz, d.nvals)
   120  	d.encoding = e
   121  	d.hasValues = true
   122  }
   123  
// DictionaryPageBuilder accumulates values into a dictionary encoder so that
// a dictionary page and per-page RLE index buffers can be produced from them.
type DictionaryPageBuilder struct {
	traits        encoding.DictEncoder // dictionary encoder for the column's physical type
	numDictValues int32                // number of distinct dictionary entries so far
	hasValues     bool                 // set once any values have been appended
}
   129  
   130  func NewDictionaryPageBuilder(d *schema.Column) *DictionaryPageBuilder {
   131  	return &DictionaryPageBuilder{
   132  		encoding.NewEncoder(d.PhysicalType(), parquet.Encodings.Plain, true, d, mem).(encoding.DictEncoder),
   133  		0, false}
   134  }
   135  
   136  func (d *DictionaryPageBuilder) AppendValues(values interface{}) encoding.Buffer {
   137  	switch v := values.(type) {
   138  	case []int32:
   139  		d.traits.(encoding.Int32Encoder).Put(v)
   140  	case []int64:
   141  		d.traits.(encoding.Int64Encoder).Put(v)
   142  	case []parquet.Int96:
   143  		d.traits.(encoding.Int96Encoder).Put(v)
   144  	case []float32:
   145  		d.traits.(encoding.Float32Encoder).Put(v)
   146  	case []float64:
   147  		d.traits.(encoding.Float64Encoder).Put(v)
   148  	case []parquet.ByteArray:
   149  		d.traits.(encoding.ByteArrayEncoder).Put(v)
   150  	}
   151  
   152  	d.numDictValues = int32(d.traits.NumEntries())
   153  	d.hasValues = true
   154  	buf, _ := d.traits.FlushValues()
   155  	return buf
   156  }
   157  
   158  func (d *DictionaryPageBuilder) WriteDict() *memory.Buffer {
   159  	buf := memory.NewBufferBytes(make([]byte, d.traits.DictEncodedSize()))
   160  	d.traits.WriteDict(buf.Bytes())
   161  	return buf
   162  }
   163  
// NumValues reports the number of distinct entries currently in the dictionary.
func (d *DictionaryPageBuilder) NumValues() int32 {
	return d.numDictValues
}
   167  
   168  func MakeDataPage(dataPageVersion parquet.DataPageVersion, d *schema.Column, values interface{}, nvals int, e parquet.Encoding, indexBuffer encoding.Buffer, defLvls, repLvls []int16, maxDef, maxRep int16) file.Page {
   169  	num := 0
   170  
   171  	stream := encoding.NewBufferWriter(1024, mem)
   172  	builder := DataPageBuilder{sink: stream, version: dataPageVersion}
   173  
   174  	if len(repLvls) > 0 {
   175  		builder.AppendRepLevels(repLvls, maxRep)
   176  	}
   177  	if len(defLvls) > 0 {
   178  		builder.AppendDefLevels(defLvls, maxDef)
   179  	}
   180  
   181  	if e == parquet.Encodings.Plain {
   182  		builder.AppendValues(d, values, e)
   183  		num = builder.nvals
   184  	} else {
   185  		stream.Write(indexBuffer.Bytes())
   186  		num = utils.MaxInt(builder.nvals, nvals)
   187  	}
   188  
   189  	buf := stream.Finish()
   190  	if dataPageVersion == parquet.DataPageV1 {
   191  		return file.NewDataPageV1(buf, int32(num), e, builder.defLvlEncoding, builder.repLvlEncoding, int32(buf.Len()))
   192  	}
   193  	return file.NewDataPageV2(buf, int32(num), 0, int32(num), e, int32(builder.defLvlBytesLen), int32(builder.repLvlBytesLen), int32(buf.Len()), false)
   194  }
   195  
   196  func MakeDictPage(d *schema.Column, values interface{}, valuesPerPage []int, e parquet.Encoding) (*file.DictionaryPage, []encoding.Buffer) {
   197  	bldr := NewDictionaryPageBuilder(d)
   198  	npages := len(valuesPerPage)
   199  
   200  	ref := reflect.ValueOf(values)
   201  	valStart := 0
   202  
   203  	rleIndices := make([]encoding.Buffer, 0, npages)
   204  	for _, nvals := range valuesPerPage {
   205  		rleIndices = append(rleIndices, bldr.AppendValues(ref.Slice(valStart, valStart+nvals).Interface()))
   206  		valStart += nvals
   207  	}
   208  
   209  	buffer := bldr.WriteDict()
   210  	return file.NewDictionaryPage(buffer, bldr.NumValues(), parquet.Encodings.Plain), rleIndices
   211  }
   212  
// MockPageReader is a testify-based mock of the file page reader interface;
// it serves pages from the "pages" entry of its mock TestData.
type MockPageReader struct {
	mock.Mock

	curpage int // 1-based index of the current page; advanced by Next
}
   218  
   219  func (m *MockPageReader) Err() error {
   220  	return m.Called().Error(0)
   221  }
   222  
// Reset is a no-op; it exists only to satisfy the page reader interface.
func (m *MockPageReader) Reset(parquet.ReaderAtSeeker, int64, compress.Compression, *file.CryptoContext) {
}
   225  
   226  func (m *MockPageReader) SetMaxPageHeaderSize(int) {}
   227  
   228  func (m *MockPageReader) Page() file.Page {
   229  	return m.TestData().Get("pages").Data().([]file.Page)[m.curpage-1]
   230  }
   231  
   232  func (m *MockPageReader) Next() bool {
   233  	pageList := m.TestData().Get("pages").Data().([]file.Page)
   234  	m.curpage++
   235  	return len(pageList) >= m.curpage
   236  }
   237  
   238  func PaginatePlain(version parquet.DataPageVersion, d *schema.Column, values reflect.Value, defLevels, repLevels []int16,
   239  	maxDef, maxRep int16, lvlsPerPage int, valuesPerPage []int, enc parquet.Encoding) []file.Page {
   240  
   241  	var (
   242  		npages      = len(valuesPerPage)
   243  		defLvlStart = 0
   244  		defLvlEnd   = 0
   245  		repLvlStart = 0
   246  		repLvlEnd   = 0
   247  		valueStart  = 0
   248  	)
   249  
   250  	pageList := make([]file.Page, 0, npages)
   251  	for i := 0; i < npages; i++ {
   252  		if maxDef > 0 {
   253  			defLvlStart = i * lvlsPerPage
   254  			defLvlEnd = (i + 1) * lvlsPerPage
   255  		}
   256  		if maxRep > 0 {
   257  			repLvlStart = i * lvlsPerPage
   258  			repLvlEnd = (i + 1) * lvlsPerPage
   259  		}
   260  
   261  		page := MakeDataPage(version, d,
   262  			values.Slice(valueStart, valueStart+valuesPerPage[i]).Interface(),
   263  			valuesPerPage[i], enc, nil, defLevels[defLvlStart:defLvlEnd],
   264  			repLevels[repLvlStart:repLvlEnd], maxDef, maxRep)
   265  		valueStart += valuesPerPage[i]
   266  		pageList = append(pageList, page)
   267  	}
   268  	return pageList
   269  }
   270  
   271  func PaginateDict(version parquet.DataPageVersion, d *schema.Column, values reflect.Value, defLevels, repLevels []int16, maxDef, maxRep int16, lvlsPerPage int, valuesPerPage []int, enc parquet.Encoding) []file.Page {
   272  	var (
   273  		npages   = len(valuesPerPage)
   274  		pages    = make([]file.Page, 0, npages)
   275  		defStart = 0
   276  		defEnd   = 0
   277  		repStart = 0
   278  		repEnd   = 0
   279  	)
   280  
   281  	dictPage, rleIndices := MakeDictPage(d, values.Interface(), valuesPerPage, enc)
   282  	pages = append(pages, dictPage)
   283  	for i := 0; i < npages; i++ {
   284  		if maxDef > 0 {
   285  			defStart = i * lvlsPerPage
   286  			defEnd = (i + 1) * lvlsPerPage
   287  		}
   288  		if maxRep > 0 {
   289  			repStart = i * lvlsPerPage
   290  			repEnd = (i + 1) * lvlsPerPage
   291  		}
   292  		page := MakeDataPage(version, d, nil, valuesPerPage[i], enc, rleIndices[i],
   293  			defLevels[defStart:defEnd], repLevels[repStart:repEnd], maxDef, maxRep)
   294  		pages = append(pages, page)
   295  	}
   296  	return pages
   297  }