github.com/fraugster/parquet-go@v0.12.0/type_bytearray.go (about) 1 package goparquet 2 3 import ( 4 "bytes" 5 "encoding/binary" 6 "errors" 7 "fmt" 8 "io" 9 10 "github.com/fraugster/parquet-go/parquet" 11 ) 12 13 type byteArrayPlainDecoder struct { 14 r io.Reader 15 // if the length is set, then this is a fix size array decoder, unless it reads the len first 16 length int 17 } 18 19 func (b *byteArrayPlainDecoder) init(r io.Reader) error { 20 b.r = r 21 return nil 22 } 23 24 func (b *byteArrayPlainDecoder) next() ([]byte, error) { 25 var l = int32(b.length) 26 if l == 0 { 27 if err := binary.Read(b.r, binary.LittleEndian, &l); err != nil { 28 return nil, err 29 } 30 31 if l < 0 { 32 return nil, errors.New("bytearray/plain: len is negative") 33 } 34 } else if l < 0 { 35 return nil, errors.New("bytearray/plain: len is negative") 36 } 37 38 buf := make([]byte, l) 39 _, err := io.ReadFull(b.r, buf) 40 if err != nil { 41 return nil, err 42 } 43 44 return buf, nil 45 } 46 47 func (b *byteArrayPlainDecoder) decodeValues(dst []interface{}) (int, error) { 48 var err error 49 for i := range dst { 50 if dst[i], err = b.next(); err != nil { 51 return i, err 52 } 53 } 54 return len(dst), nil 55 } 56 57 type byteArrayPlainEncoder struct { 58 w io.Writer 59 60 length int 61 } 62 63 func (b *byteArrayPlainEncoder) init(w io.Writer) error { 64 b.w = w 65 66 return nil 67 } 68 69 func (b *byteArrayPlainEncoder) writeBytes(data []byte) error { 70 l := b.length 71 if l == 0 { // variable length 72 l = len(data) 73 l32 := int32(l) 74 if err := binary.Write(b.w, binary.LittleEndian, l32); err != nil { 75 return err 76 } 77 } else if len(data) != l { 78 return fmt.Errorf("the byte array should be with length %d but is %d", l, len(data)) 79 } 80 81 return writeFull(b.w, data) 82 } 83 84 func (b *byteArrayPlainEncoder) encodeValues(values []interface{}) error { 85 for i := range values { 86 if err := b.writeBytes(values[i].([]byte)); err != nil { 87 return err 88 } 89 } 90 91 return nil 92 } 93 94 func (*byteArrayPlainEncoder) Close() error { 95 return nil 96 } 97 98 type byteArrayDeltaLengthDecoder struct { 99 r io.Reader 100 position int 101 lens []int32 102 } 103 104 func (b *byteArrayDeltaLengthDecoder) init(r io.Reader) error { 105 b.r = r 106 b.position = 0 107 lensDecoder := int32DeltaBPDecoder{} 108 if err := lensDecoder.init(r); err != nil { 109 return err 110 } 111 112 b.lens = make([]int32, lensDecoder.valuesCount) 113 return decodeInt32(&lensDecoder, b.lens) 114 } 115 116 func (b *byteArrayDeltaLengthDecoder) next() ([]byte, error) { 117 if b.position >= len(b.lens) { 118 return nil, io.EOF 119 } 120 size := int(b.lens[b.position]) 121 value := make([]byte, size) 122 if _, err := io.ReadFull(b.r, value); err != nil { 123 return nil, fmt.Errorf("there is no byte left: %w", err) 124 } 125 b.position++ 126 127 return value, nil 128 } 129 130 func (b *byteArrayDeltaLengthDecoder) decodeValues(dst []interface{}) (int, error) { 131 total := len(dst) 132 for i := 0; i < total; i++ { 133 v, err := b.next() 134 if err != nil { 135 return i, err 136 } 137 dst[i] = v 138 } 139 return total, nil 140 } 141 142 // this type is used inside the byteArrayDeltaEncoder, the Close method should do the actual write, not before. 143 type byteArrayDeltaLengthEncoder struct { 144 w io.Writer 145 buf *bytes.Buffer 146 lens []interface{} 147 } 148 149 func (b *byteArrayDeltaLengthEncoder) init(w io.Writer) error { 150 b.w = w 151 b.buf = &bytes.Buffer{} 152 return nil 153 } 154 155 func (b *byteArrayDeltaLengthEncoder) writeOne(data []byte) error { 156 b.lens = append(b.lens, int32(len(data))) 157 return writeFull(b.buf, data) 158 } 159 160 func (b *byteArrayDeltaLengthEncoder) encodeValues(values []interface{}) error { 161 if b.lens == nil { 162 // this is just for the first time, maybe we need to copy and increase the cap in the next calls? 163 b.lens = make([]interface{}, 0, len(values)) 164 } 165 for i := range values { 166 if err := b.writeOne(values[i].([]byte)); err != nil { 167 return err 168 } 169 } 170 171 return nil 172 } 173 174 func (b *byteArrayDeltaLengthEncoder) Close() error { 175 enc := &int32DeltaBPEncoder{ 176 deltaBitPackEncoder32: deltaBitPackEncoder32{ 177 blockSize: 128, 178 miniBlockCount: 4, 179 }, 180 } 181 182 if err := encodeValue(b.w, enc, b.lens); err != nil { 183 return err 184 } 185 186 return writeFull(b.w, b.buf.Bytes()) 187 } 188 189 type byteArrayDeltaDecoder struct { 190 suffixDecoder byteArrayDeltaLengthDecoder 191 prefixLens []int32 192 previousValue []byte 193 } 194 195 func (d *byteArrayDeltaDecoder) init(r io.Reader) error { 196 lensDecoder := deltaBitPackDecoder32{} 197 if err := lensDecoder.init(r); err != nil { 198 return err 199 } 200 201 d.prefixLens = make([]int32, lensDecoder.valuesCount) 202 if err := decodeInt32(&lensDecoder, d.prefixLens); err != nil { 203 return err 204 } 205 if err := d.suffixDecoder.init(r); err != nil { 206 return err 207 } 208 209 if len(d.prefixLens) != len(d.suffixDecoder.lens) { 210 return errors.New("bytearray/delta: different number of suffixes and prefixes") 211 } 212 d.previousValue = make([]byte, 0) 213 214 return nil 215 } 216 217 func (d *byteArrayDeltaDecoder) decodeValues(dst []interface{}) (int, error) { 218 total := len(dst) 219 for i := 0; i < total; i++ { 220 suffix, err := d.suffixDecoder.next() 221 if err != nil { 222 return i, err 223 } 224 // after this line no error is acceptable 225 prefixLen := int(d.prefixLens[d.suffixDecoder.position-1]) 226 value := make([]byte, 0, prefixLen+len(suffix)) 227 if len(d.previousValue) < prefixLen { 228 // prevent panic from invalid input 229 return 0, fmt.Errorf("invalid prefix len in the stream, the value is %d byte but the it needs %d byte", len(d.previousValue), prefixLen) 230 } 231 if prefixLen > 0 { 232 value = append(value, d.previousValue[:prefixLen]...) 233 } 234 value = append(value, suffix...) 235 d.previousValue = value 236 dst[i] = value 237 } 238 239 return total, nil 240 } 241 242 type byteArrayDeltaEncoder struct { 243 w io.Writer 244 245 prefixLens []interface{} 246 previousValue []byte 247 248 values *byteArrayDeltaLengthEncoder 249 } 250 251 func (b *byteArrayDeltaEncoder) init(w io.Writer) error { 252 b.w = w 253 b.prefixLens = nil 254 b.previousValue = []byte{} 255 b.values = &byteArrayDeltaLengthEncoder{} 256 return b.values.init(w) 257 } 258 259 func (b *byteArrayDeltaEncoder) encodeValues(values []interface{}) error { 260 if b.prefixLens == nil { 261 b.prefixLens = make([]interface{}, 0, len(values)) 262 b.values.lens = make([]interface{}, 0, len(values)) 263 } 264 265 for i := range values { 266 data := values[i].([]byte) 267 pLen := prefix(b.previousValue, data) 268 b.prefixLens = append(b.prefixLens, int32(pLen)) 269 if err := b.values.writeOne(data[pLen:]); err != nil { 270 return err 271 } 272 b.previousValue = data 273 } 274 275 return nil 276 } 277 278 func (b *byteArrayDeltaEncoder) Close() error { 279 // write the lens first 280 enc := &int32DeltaBPEncoder{ 281 deltaBitPackEncoder32: deltaBitPackEncoder32{ 282 blockSize: 128, 283 miniBlockCount: 4, 284 }, 285 } 286 287 if err := encodeValue(b.w, enc, b.prefixLens); err != nil { 288 return err 289 } 290 291 return b.values.Close() 292 } 293 294 type byteArrayStore struct { 295 repTyp parquet.FieldRepetitionType 296 stats statistics 297 pageStats statistics 298 299 *ColumnParameters 300 } 301 302 func (is *byteArrayStore) getStats() minMaxValues { 303 return &is.stats 304 } 305 306 func (is *byteArrayStore) getPageStats() minMaxValues { 307 return &is.pageStats 308 } 309 310 func (is *byteArrayStore) params() *ColumnParameters { 311 if is.ColumnParameters == nil { 312 panic("ColumnParameters is nil") 313 } 314 return is.ColumnParameters 315 } 316 317 func (is *byteArrayStore) sizeOf(v interface{}) int { 318 if vv, ok := v.([][]byte); ok { 319 l := 0 320 for _, vvv := range vv { 321 l += len(vvv) 322 } 323 return l 324 } 325 return len(v.([]byte)) 326 } 327 328 func (is *byteArrayStore) parquetType() parquet.Type { 329 if is.TypeLength != nil && *is.TypeLength > 0 { 330 return parquet.Type_FIXED_LEN_BYTE_ARRAY 331 } 332 return parquet.Type_BYTE_ARRAY 333 } 334 335 func (is *byteArrayStore) repetitionType() parquet.FieldRepetitionType { 336 return is.repTyp 337 } 338 339 func (is *byteArrayStore) reset(repetitionType parquet.FieldRepetitionType) { 340 is.repTyp = repetitionType 341 342 is.stats.reset() 343 is.pageStats.reset() 344 } 345 346 func (is *byteArrayStore) setMinMax(j []byte) error { 347 if is.TypeLength != nil && *is.TypeLength > 0 && int32(len(j)) != *is.TypeLength { 348 return fmt.Errorf("the size of data should be %d but is %d", *is.TypeLength, len(j)) 349 } 350 // For nil value there is no need to set the min/max 351 if j == nil { 352 return nil 353 } 354 355 is.stats.setMinMax(j) 356 is.pageStats.setMinMax(j) 357 358 return nil 359 } 360 361 func (is *byteArrayStore) getValues(v interface{}) ([]interface{}, error) { 362 var vals []interface{} 363 switch typed := v.(type) { 364 case []byte: 365 vals = []interface{}{typed} 366 case [][]byte: 367 if is.repTyp != parquet.FieldRepetitionType_REPEATED { 368 return nil, fmt.Errorf("the value is not repeated but it is an array") 369 } 370 vals = make([]interface{}, len(typed)) 371 for j := range typed { 372 vals[j] = typed[j] 373 } 374 case string: 375 vals = []interface{}{[]byte(typed)} 376 case []string: 377 if is.repTyp != parquet.FieldRepetitionType_REPEATED { 378 return nil, fmt.Errorf("the value is not repeated but it is an array") 379 } 380 vals = make([]interface{}, len(typed)) 381 for j := range typed { 382 vals[j] = []byte(typed[j]) 383 } 384 default: 385 return nil, fmt.Errorf("unsupported type for storing in []byte column %T => %+v", v, v) 386 } 387 388 return vals, nil 389 } 390 391 func (*byteArrayStore) append(arrayIn interface{}, value interface{}) interface{} { 392 if arrayIn == nil { 393 arrayIn = make([][]byte, 0, 1) 394 } 395 return append(arrayIn.([][]byte), value.([]byte)) 396 }