github.com/apache/arrow/go/v14@v14.0.2/parquet/internal/testutils/pagebuilder.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package testutils
    18  
    19  import (
    20  	"encoding/binary"
    21  	"fmt"
    22  	"io"
    23  	"reflect"
    24  
    25  	"github.com/apache/arrow/go/v14/arrow/memory"
    26  	"github.com/apache/arrow/go/v14/internal/utils"
    27  	"github.com/apache/arrow/go/v14/parquet"
    28  	"github.com/apache/arrow/go/v14/parquet/compress"
    29  	"github.com/apache/arrow/go/v14/parquet/file"
    30  	"github.com/apache/arrow/go/v14/parquet/internal/encoding"
    31  	"github.com/apache/arrow/go/v14/parquet/schema"
    32  	"github.com/stretchr/testify/mock"
    33  )
    34  
// DataPageBuilder writes the encoded pieces of a data page (levels and
// values) to sink and keeps bookkeeping about what has been appended so far.
type DataPageBuilder struct {
	// sink receives every byte produced by the Append* methods.
	sink    io.Writer
	// version is consulted by appendLevels: for parquet.DataPageV1 the
	// encoded levels are prefixed with their byte length.
	version parquet.DataPageVersion

	// nvals is the largest number of levels or values passed to any single
	// Append* call so far.
	nvals          int
	// encoding is the encoding given to the most recent AppendValues call.
	encoding       parquet.Encoding
	// defLvlEncoding and repLvlEncoding are set to RLE by AppendDefLevels
	// and AppendRepLevels respectively.
	defLvlEncoding parquet.Encoding
	repLvlEncoding parquet.Encoding
	// defLvlBytesLen and repLvlBytesLen hold the encoded level sizes
	// returned by appendLevels.
	defLvlBytesLen int
	repLvlBytesLen int
	// hasDefLvls, hasRepLvls and hasValues are set to true once the
	// corresponding Append* method has been called.
	hasDefLvls     bool
	hasRepLvls     bool
	hasValues      bool
}
    49  
// mem is the allocator returned by memory.NewGoAllocator; it is shared by the
// value encoders and the page buffer writer created in this file.
var mem = memory.NewGoAllocator()
    51  
    52  func (d *DataPageBuilder) appendLevels(lvls []int16, maxLvl int16, e parquet.Encoding) int {
    53  	if e != parquet.Encodings.RLE {
    54  		panic("parquet: only rle encoding currently implemented")
    55  	}
    56  
    57  	buf := encoding.NewBufferWriter(encoding.LevelEncodingMaxBufferSize(e, maxLvl, len(lvls)), memory.DefaultAllocator)
    58  	var enc encoding.LevelEncoder
    59  	enc.Init(e, maxLvl, buf)
    60  	enc.Encode(lvls)
    61  
    62  	rleBytes := enc.Len()
    63  	if d.version == parquet.DataPageV1 {
    64  		if err := binary.Write(d.sink, binary.LittleEndian, int32(rleBytes)); err != nil {
    65  			panic(err)
    66  		}
    67  	}
    68  
    69  	if _, err := d.sink.Write(buf.Bytes()[:rleBytes]); err != nil {
    70  		panic(err)
    71  	}
    72  	return rleBytes
    73  }
    74  
    75  func (d *DataPageBuilder) AppendDefLevels(lvls []int16, maxLvl int16) {
    76  	d.defLvlBytesLen = d.appendLevels(lvls, maxLvl, parquet.Encodings.RLE)
    77  
    78  	d.nvals = utils.MaxInt(len(lvls), d.nvals)
    79  	d.defLvlEncoding = parquet.Encodings.RLE
    80  	d.hasDefLvls = true
    81  }
    82  
    83  func (d *DataPageBuilder) AppendRepLevels(lvls []int16, maxLvl int16) {
    84  	d.repLvlBytesLen = d.appendLevels(lvls, maxLvl, parquet.Encodings.RLE)
    85  
    86  	d.nvals = utils.MaxInt(len(lvls), d.nvals)
    87  	d.repLvlEncoding = parquet.Encodings.RLE
    88  	d.hasRepLvls = true
    89  }
    90  
    91  func (d *DataPageBuilder) AppendValues(desc *schema.Column, values interface{}, e parquet.Encoding) {
    92  	enc := encoding.NewEncoder(desc.PhysicalType(), e, false, desc, mem)
    93  	var sz int
    94  	switch v := values.(type) {
    95  	case []bool:
    96  		enc.(encoding.BooleanEncoder).Put(v)
    97  		sz = len(v)
    98  	case []int32:
    99  		enc.(encoding.Int32Encoder).Put(v)
   100  		sz = len(v)
   101  	case []int64:
   102  		enc.(encoding.Int64Encoder).Put(v)
   103  		sz = len(v)
   104  	case []parquet.Int96:
   105  		enc.(encoding.Int96Encoder).Put(v)
   106  		sz = len(v)
   107  	case []float32:
   108  		enc.(encoding.Float32Encoder).Put(v)
   109  		sz = len(v)
   110  	case []float64:
   111  		enc.(encoding.Float64Encoder).Put(v)
   112  		sz = len(v)
   113  	case []parquet.ByteArray:
   114  		enc.(encoding.ByteArrayEncoder).Put(v)
   115  		sz = len(v)
   116  	default:
   117  		panic(fmt.Sprintf("no testutil data page builder for type %T", values))
   118  	}
   119  	buf, _ := enc.FlushValues()
   120  	_, err := d.sink.Write(buf.Bytes())
   121  	if err != nil {
   122  		panic(err)
   123  	}
   124  
   125  	d.nvals = utils.MaxInt(sz, d.nvals)
   126  	d.encoding = e
   127  	d.hasValues = true
   128  }
   129  
// DictionaryPageBuilder feeds values into a dictionary encoder and tracks how
// many dictionary entries it holds.
type DictionaryPageBuilder struct {
	// traits is the dictionary encoder that values are put into.
	traits        encoding.DictEncoder
	// numDictValues is the encoder's NumEntries() as of the last AppendValues call.
	numDictValues int32
	// hasValues is set to true once AppendValues has been called.
	hasValues     bool
}
   135  
   136  func NewDictionaryPageBuilder(d *schema.Column) *DictionaryPageBuilder {
   137  	return &DictionaryPageBuilder{
   138  		encoding.NewEncoder(d.PhysicalType(), parquet.Encodings.Plain, true, d, mem).(encoding.DictEncoder),
   139  		0, false}
   140  }
   141  
   142  func (d *DictionaryPageBuilder) AppendValues(values interface{}) encoding.Buffer {
   143  	switch v := values.(type) {
   144  	case []int32:
   145  		d.traits.(encoding.Int32Encoder).Put(v)
   146  	case []int64:
   147  		d.traits.(encoding.Int64Encoder).Put(v)
   148  	case []parquet.Int96:
   149  		d.traits.(encoding.Int96Encoder).Put(v)
   150  	case []float32:
   151  		d.traits.(encoding.Float32Encoder).Put(v)
   152  	case []float64:
   153  		d.traits.(encoding.Float64Encoder).Put(v)
   154  	case []parquet.ByteArray:
   155  		d.traits.(encoding.ByteArrayEncoder).Put(v)
   156  	default:
   157  		panic(fmt.Sprintf("no testutil dictionary page builder for type %T", values))
   158  	}
   159  
   160  	d.numDictValues = int32(d.traits.NumEntries())
   161  	d.hasValues = true
   162  	buf, _ := d.traits.FlushValues()
   163  	return buf
   164  }
   165  
   166  func (d *DictionaryPageBuilder) WriteDict() *memory.Buffer {
   167  	buf := memory.NewBufferBytes(make([]byte, d.traits.DictEncodedSize()))
   168  	d.traits.WriteDict(buf.Bytes())
   169  	return buf
   170  }
   171  
// NumValues returns the number of dictionary entries recorded by the most
// recent AppendValues call (zero if AppendValues has not been called).
func (d *DictionaryPageBuilder) NumValues() int32 {
	return d.numDictValues
}
   175  
   176  func MakeDataPage(dataPageVersion parquet.DataPageVersion, d *schema.Column, values interface{}, nvals int, e parquet.Encoding, indexBuffer encoding.Buffer, defLvls, repLvls []int16, maxDef, maxRep int16) file.Page {
   177  	num := 0
   178  
   179  	stream := encoding.NewBufferWriter(1024, mem)
   180  	builder := DataPageBuilder{sink: stream, version: dataPageVersion}
   181  
   182  	if len(repLvls) > 0 {
   183  		builder.AppendRepLevels(repLvls, maxRep)
   184  	}
   185  	if len(defLvls) > 0 {
   186  		builder.AppendDefLevels(defLvls, maxDef)
   187  	}
   188  
   189  	if e == parquet.Encodings.Plain {
   190  		builder.AppendValues(d, values, e)
   191  		num = builder.nvals
   192  	} else {
   193  		stream.Write(indexBuffer.Bytes())
   194  		num = utils.MaxInt(builder.nvals, nvals)
   195  	}
   196  
   197  	buf := stream.Finish()
   198  	if dataPageVersion == parquet.DataPageV1 {
   199  		return file.NewDataPageV1(buf, int32(num), e, builder.defLvlEncoding, builder.repLvlEncoding, int32(buf.Len()))
   200  	}
   201  	return file.NewDataPageV2(buf, int32(num), 0, int32(num), e, int32(builder.defLvlBytesLen), int32(builder.repLvlBytesLen), int32(buf.Len()), false)
   202  }
   203  
   204  func MakeDictPage(d *schema.Column, values interface{}, valuesPerPage []int, e parquet.Encoding) (*file.DictionaryPage, []encoding.Buffer) {
   205  	bldr := NewDictionaryPageBuilder(d)
   206  	npages := len(valuesPerPage)
   207  
   208  	ref := reflect.ValueOf(values)
   209  	valStart := 0
   210  
   211  	rleIndices := make([]encoding.Buffer, 0, npages)
   212  	for _, nvals := range valuesPerPage {
   213  		rleIndices = append(rleIndices, bldr.AppendValues(ref.Slice(valStart, valStart+nvals).Interface()))
   214  		valStart += nvals
   215  	}
   216  
   217  	buffer := bldr.WriteDict()
   218  	return file.NewDictionaryPage(buffer, bldr.NumValues(), parquet.Encodings.Plain), rleIndices
   219  }
   220  
// MockPageReader is a testify mock that serves pages from a []file.Page
// stored under the "pages" key of the mock's test data.
type MockPageReader struct {
	mock.Mock

	// curpage counts how many times Next has been called; Page returns the
	// element at index curpage-1.
	curpage int
}
   226  
// Err records the call on the mock and returns the first configured return
// value as an error.
func (m *MockPageReader) Err() error {
	return m.Called().Error(0)
}
   230  
// Reset is a no-op; all arguments are ignored.
func (m *MockPageReader) Reset(parquet.BufferedReader, int64, compress.Compression, *file.CryptoContext) {
}
   233  
// SetMaxPageHeaderSize is a no-op; the argument is ignored.
func (m *MockPageReader) SetMaxPageHeaderSize(int) {}
   235  
   236  func (m *MockPageReader) Page() file.Page {
   237  	return m.TestData().Get("pages").Data().([]file.Page)[m.curpage-1]
   238  }
   239  
   240  func (m *MockPageReader) Next() bool {
   241  	pageList := m.TestData().Get("pages").Data().([]file.Page)
   242  	m.curpage++
   243  	return len(pageList) >= m.curpage
   244  }
   245  
   246  func PaginatePlain(version parquet.DataPageVersion, d *schema.Column, values reflect.Value, defLevels, repLevels []int16,
   247  	maxDef, maxRep int16, lvlsPerPage int, valuesPerPage []int, enc parquet.Encoding) []file.Page {
   248  
   249  	var (
   250  		npages      = len(valuesPerPage)
   251  		defLvlStart = 0
   252  		defLvlEnd   = 0
   253  		repLvlStart = 0
   254  		repLvlEnd   = 0
   255  		valueStart  = 0
   256  	)
   257  
   258  	pageList := make([]file.Page, 0, npages)
   259  	for i := 0; i < npages; i++ {
   260  		if maxDef > 0 {
   261  			defLvlStart = i * lvlsPerPage
   262  			defLvlEnd = (i + 1) * lvlsPerPage
   263  		}
   264  		if maxRep > 0 {
   265  			repLvlStart = i * lvlsPerPage
   266  			repLvlEnd = (i + 1) * lvlsPerPage
   267  		}
   268  
   269  		page := MakeDataPage(version, d,
   270  			values.Slice(valueStart, valueStart+valuesPerPage[i]).Interface(),
   271  			valuesPerPage[i], enc, nil, defLevels[defLvlStart:defLvlEnd],
   272  			repLevels[repLvlStart:repLvlEnd], maxDef, maxRep)
   273  		valueStart += valuesPerPage[i]
   274  		pageList = append(pageList, page)
   275  	}
   276  	return pageList
   277  }
   278  
   279  func PaginateDict(version parquet.DataPageVersion, d *schema.Column, values reflect.Value, defLevels, repLevels []int16, maxDef, maxRep int16, lvlsPerPage int, valuesPerPage []int, enc parquet.Encoding) []file.Page {
   280  	var (
   281  		npages   = len(valuesPerPage)
   282  		pages    = make([]file.Page, 0, npages)
   283  		defStart = 0
   284  		defEnd   = 0
   285  		repStart = 0
   286  		repEnd   = 0
   287  	)
   288  
   289  	dictPage, rleIndices := MakeDictPage(d, values.Interface(), valuesPerPage, enc)
   290  	pages = append(pages, dictPage)
   291  	for i := 0; i < npages; i++ {
   292  		if maxDef > 0 {
   293  			defStart = i * lvlsPerPage
   294  			defEnd = (i + 1) * lvlsPerPage
   295  		}
   296  		if maxRep > 0 {
   297  			repStart = i * lvlsPerPage
   298  			repEnd = (i + 1) * lvlsPerPage
   299  		}
   300  		page := MakeDataPage(version, d, nil, valuesPerPage[i], enc, rleIndices[i],
   301  			defLevels[defStart:defEnd], repLevels[repStart:repEnd], maxDef, maxRep)
   302  		pages = append(pages, page)
   303  	}
   304  	return pages
   305  }