github.com/apache/arrow/go/v7@v7.0.1/parquet/internal/testutils/pagebuilder.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package testutils 18 19 import ( 20 "encoding/binary" 21 "io" 22 "reflect" 23 24 "github.com/apache/arrow/go/v7/arrow/memory" 25 "github.com/apache/arrow/go/v7/parquet" 26 "github.com/apache/arrow/go/v7/parquet/compress" 27 "github.com/apache/arrow/go/v7/parquet/file" 28 "github.com/apache/arrow/go/v7/parquet/internal/encoding" 29 "github.com/apache/arrow/go/v7/parquet/internal/utils" 30 "github.com/apache/arrow/go/v7/parquet/schema" 31 "github.com/stretchr/testify/mock" 32 ) 33 34 type DataPageBuilder struct { 35 sink io.Writer 36 version parquet.DataPageVersion 37 38 nvals int 39 encoding parquet.Encoding 40 defLvlEncoding parquet.Encoding 41 repLvlEncoding parquet.Encoding 42 defLvlBytesLen int 43 repLvlBytesLen int 44 hasDefLvls bool 45 hasRepLvls bool 46 hasValues bool 47 } 48 49 var mem = memory.NewGoAllocator() 50 51 func (d *DataPageBuilder) appendLevels(lvls []int16, maxLvl int16, e parquet.Encoding) int { 52 if e != parquet.Encodings.RLE { 53 panic("parquet: only rle encoding currently implemented") 54 } 55 56 buf := encoding.NewBufferWriter(encoding.LevelEncodingMaxBufferSize(e, maxLvl, len(lvls)), memory.DefaultAllocator) 57 var enc encoding.LevelEncoder 58 enc.Init(e, maxLvl, buf) 59 enc.Encode(lvls) 60 61 rleBytes := enc.Len() 62 if d.version == parquet.DataPageV1 { 63 if err := binary.Write(d.sink, binary.LittleEndian, int32(rleBytes)); err != nil { 64 panic(err) 65 } 66 } 67 68 if _, err := d.sink.Write(buf.Bytes()[:rleBytes]); err != nil { 69 panic(err) 70 } 71 return rleBytes 72 } 73 74 func (d *DataPageBuilder) AppendDefLevels(lvls []int16, maxLvl int16) { 75 d.defLvlBytesLen = d.appendLevels(lvls, maxLvl, parquet.Encodings.RLE) 76 77 d.nvals = utils.MaxInt(len(lvls), d.nvals) 78 d.defLvlEncoding = parquet.Encodings.RLE 79 d.hasDefLvls = true 80 } 81 82 func (d *DataPageBuilder) AppendRepLevels(lvls []int16, maxLvl int16) { 83 d.repLvlBytesLen = d.appendLevels(lvls, maxLvl, parquet.Encodings.RLE) 84 85 d.nvals = utils.MaxInt(len(lvls), d.nvals) 86 d.repLvlEncoding = parquet.Encodings.RLE 87 d.hasRepLvls = true 88 } 89 90 func (d *DataPageBuilder) AppendValues(desc *schema.Column, values interface{}, e parquet.Encoding) { 91 enc := encoding.NewEncoder(desc.PhysicalType(), e, false, desc, mem) 92 var sz int 93 switch v := values.(type) { 94 case []int32: 95 enc.(encoding.Int32Encoder).Put(v) 96 sz = len(v) 97 case []int64: 98 enc.(encoding.Int64Encoder).Put(v) 99 sz = len(v) 100 case []parquet.Int96: 101 enc.(encoding.Int96Encoder).Put(v) 102 sz = len(v) 103 case []float32: 104 enc.(encoding.Float32Encoder).Put(v) 105 sz = len(v) 106 case []float64: 107 enc.(encoding.Float64Encoder).Put(v) 108 sz = len(v) 109 case []parquet.ByteArray: 110 enc.(encoding.ByteArrayEncoder).Put(v) 111 sz = len(v) 112 } 113 buf, _ := enc.FlushValues() 114 _, err := d.sink.Write(buf.Bytes()) 115 if err != nil { 116 panic(err) 117 } 118 119 d.nvals = utils.MaxInt(sz, d.nvals) 120 d.encoding = e 121 d.hasValues = true 122 } 123 124 type DictionaryPageBuilder struct { 125 traits encoding.DictEncoder 126 numDictValues int32 127 hasValues bool 128 } 129 130 func NewDictionaryPageBuilder(d *schema.Column) *DictionaryPageBuilder { 131 return &DictionaryPageBuilder{ 132 encoding.NewEncoder(d.PhysicalType(), parquet.Encodings.Plain, true, d, mem).(encoding.DictEncoder), 133 0, false} 134 } 135 136 func (d *DictionaryPageBuilder) AppendValues(values interface{}) encoding.Buffer { 137 switch v := values.(type) { 138 case []int32: 139 d.traits.(encoding.Int32Encoder).Put(v) 140 case []int64: 141 d.traits.(encoding.Int64Encoder).Put(v) 142 case []parquet.Int96: 143 d.traits.(encoding.Int96Encoder).Put(v) 144 case []float32: 145 d.traits.(encoding.Float32Encoder).Put(v) 146 case []float64: 147 d.traits.(encoding.Float64Encoder).Put(v) 148 case []parquet.ByteArray: 149 d.traits.(encoding.ByteArrayEncoder).Put(v) 150 } 151 152 d.numDictValues = int32(d.traits.NumEntries()) 153 d.hasValues = true 154 buf, _ := d.traits.FlushValues() 155 return buf 156 } 157 158 func (d *DictionaryPageBuilder) WriteDict() *memory.Buffer { 159 buf := memory.NewBufferBytes(make([]byte, d.traits.DictEncodedSize())) 160 d.traits.WriteDict(buf.Bytes()) 161 return buf 162 } 163 164 func (d *DictionaryPageBuilder) NumValues() int32 { 165 return d.numDictValues 166 } 167 168 func MakeDataPage(dataPageVersion parquet.DataPageVersion, d *schema.Column, values interface{}, nvals int, e parquet.Encoding, indexBuffer encoding.Buffer, defLvls, repLvls []int16, maxDef, maxRep int16) file.Page { 169 num := 0 170 171 stream := encoding.NewBufferWriter(1024, mem) 172 builder := DataPageBuilder{sink: stream, version: dataPageVersion} 173 174 if len(repLvls) > 0 { 175 builder.AppendRepLevels(repLvls, maxRep) 176 } 177 if len(defLvls) > 0 { 178 builder.AppendDefLevels(defLvls, maxDef) 179 } 180 181 if e == parquet.Encodings.Plain { 182 builder.AppendValues(d, values, e) 183 num = builder.nvals 184 } else { 185 stream.Write(indexBuffer.Bytes()) 186 num = utils.MaxInt(builder.nvals, nvals) 187 } 188 189 buf := stream.Finish() 190 if dataPageVersion == parquet.DataPageV1 { 191 return file.NewDataPageV1(buf, int32(num), e, builder.defLvlEncoding, builder.repLvlEncoding, int32(buf.Len())) 192 } 193 return file.NewDataPageV2(buf, int32(num), 0, int32(num), e, int32(builder.defLvlBytesLen), int32(builder.repLvlBytesLen), int32(buf.Len()), false) 194 } 195 196 func MakeDictPage(d *schema.Column, values interface{}, valuesPerPage []int, e parquet.Encoding) (*file.DictionaryPage, []encoding.Buffer) { 197 bldr := NewDictionaryPageBuilder(d) 198 npages := len(valuesPerPage) 199 200 ref := reflect.ValueOf(values) 201 valStart := 0 202 203 rleIndices := make([]encoding.Buffer, 0, npages) 204 for _, nvals := range valuesPerPage { 205 rleIndices = append(rleIndices, bldr.AppendValues(ref.Slice(valStart, valStart+nvals).Interface())) 206 valStart += nvals 207 } 208 209 buffer := bldr.WriteDict() 210 return file.NewDictionaryPage(buffer, bldr.NumValues(), parquet.Encodings.Plain), rleIndices 211 } 212 213 type MockPageReader struct { 214 mock.Mock 215 216 curpage int 217 } 218 219 func (m *MockPageReader) Err() error { 220 return m.Called().Error(0) 221 } 222 223 func (m *MockPageReader) Reset(parquet.ReaderAtSeeker, int64, compress.Compression, *file.CryptoContext) { 224 } 225 226 func (m *MockPageReader) SetMaxPageHeaderSize(int) {} 227 228 func (m *MockPageReader) Page() file.Page { 229 return m.TestData().Get("pages").Data().([]file.Page)[m.curpage-1] 230 } 231 232 func (m *MockPageReader) Next() bool { 233 pageList := m.TestData().Get("pages").Data().([]file.Page) 234 m.curpage++ 235 return len(pageList) >= m.curpage 236 } 237 238 func PaginatePlain(version parquet.DataPageVersion, d *schema.Column, values reflect.Value, defLevels, repLevels []int16, 239 maxDef, maxRep int16, lvlsPerPage int, valuesPerPage []int, enc parquet.Encoding) []file.Page { 240 241 var ( 242 npages = len(valuesPerPage) 243 defLvlStart = 0 244 defLvlEnd = 0 245 repLvlStart = 0 246 repLvlEnd = 0 247 valueStart = 0 248 ) 249 250 pageList := make([]file.Page, 0, npages) 251 for i := 0; i < npages; i++ { 252 if maxDef > 0 { 253 defLvlStart = i * lvlsPerPage 254 defLvlEnd = (i + 1) * lvlsPerPage 255 } 256 if maxRep > 0 { 257 repLvlStart = i * lvlsPerPage 258 repLvlEnd = (i + 1) * lvlsPerPage 259 } 260 261 page := MakeDataPage(version, d, 262 values.Slice(valueStart, valueStart+valuesPerPage[i]).Interface(), 263 valuesPerPage[i], enc, nil, defLevels[defLvlStart:defLvlEnd], 264 repLevels[repLvlStart:repLvlEnd], maxDef, maxRep) 265 valueStart += valuesPerPage[i] 266 pageList = append(pageList, page) 267 } 268 return pageList 269 } 270 271 func PaginateDict(version parquet.DataPageVersion, d *schema.Column, values reflect.Value, defLevels, repLevels []int16, maxDef, maxRep int16, lvlsPerPage int, valuesPerPage []int, enc parquet.Encoding) []file.Page { 272 var ( 273 npages = len(valuesPerPage) 274 pages = make([]file.Page, 0, npages) 275 defStart = 0 276 defEnd = 0 277 repStart = 0 278 repEnd = 0 279 ) 280 281 dictPage, rleIndices := MakeDictPage(d, values.Interface(), valuesPerPage, enc) 282 pages = append(pages, dictPage) 283 for i := 0; i < npages; i++ { 284 if maxDef > 0 { 285 defStart = i * lvlsPerPage 286 defEnd = (i + 1) * lvlsPerPage 287 } 288 if maxRep > 0 { 289 repStart = i * lvlsPerPage 290 repEnd = (i + 1) * lvlsPerPage 291 } 292 page := MakeDataPage(version, d, nil, valuesPerPage[i], enc, rleIndices[i], 293 defLevels[defStart:defEnd], repLevels[repStart:repEnd], maxDef, maxRep) 294 pages = append(pages, page) 295 } 296 return pages 297 }