github.com/fraugster/parquet-go@v0.12.0/type_bytearray.go (about)

     1  package goparquet
     2  
     3  import (
     4  	"bytes"
     5  	"encoding/binary"
     6  	"errors"
     7  	"fmt"
     8  	"io"
     9  
    10  	"github.com/fraugster/parquet-go/parquet"
    11  )
    12  
    13  type byteArrayPlainDecoder struct {
    14  	r io.Reader
    15  	// if the length is set, then this is a fix size array decoder, unless it reads the len first
    16  	length int
    17  }
    18  
    19  func (b *byteArrayPlainDecoder) init(r io.Reader) error {
    20  	b.r = r
    21  	return nil
    22  }
    23  
    24  func (b *byteArrayPlainDecoder) next() ([]byte, error) {
    25  	var l = int32(b.length)
    26  	if l == 0 {
    27  		if err := binary.Read(b.r, binary.LittleEndian, &l); err != nil {
    28  			return nil, err
    29  		}
    30  
    31  		if l < 0 {
    32  			return nil, errors.New("bytearray/plain: len is negative")
    33  		}
    34  	} else if l < 0 {
    35  		return nil, errors.New("bytearray/plain: len is negative")
    36  	}
    37  
    38  	buf := make([]byte, l)
    39  	_, err := io.ReadFull(b.r, buf)
    40  	if err != nil {
    41  		return nil, err
    42  	}
    43  
    44  	return buf, nil
    45  }
    46  
    47  func (b *byteArrayPlainDecoder) decodeValues(dst []interface{}) (int, error) {
    48  	var err error
    49  	for i := range dst {
    50  		if dst[i], err = b.next(); err != nil {
    51  			return i, err
    52  		}
    53  	}
    54  	return len(dst), nil
    55  }
    56  
    57  type byteArrayPlainEncoder struct {
    58  	w io.Writer
    59  
    60  	length int
    61  }
    62  
    63  func (b *byteArrayPlainEncoder) init(w io.Writer) error {
    64  	b.w = w
    65  
    66  	return nil
    67  }
    68  
    69  func (b *byteArrayPlainEncoder) writeBytes(data []byte) error {
    70  	l := b.length
    71  	if l == 0 { // variable length
    72  		l = len(data)
    73  		l32 := int32(l)
    74  		if err := binary.Write(b.w, binary.LittleEndian, l32); err != nil {
    75  			return err
    76  		}
    77  	} else if len(data) != l {
    78  		return fmt.Errorf("the byte array should be with length %d but is %d", l, len(data))
    79  	}
    80  
    81  	return writeFull(b.w, data)
    82  }
    83  
    84  func (b *byteArrayPlainEncoder) encodeValues(values []interface{}) error {
    85  	for i := range values {
    86  		if err := b.writeBytes(values[i].([]byte)); err != nil {
    87  			return err
    88  		}
    89  	}
    90  
    91  	return nil
    92  }
    93  
    94  func (*byteArrayPlainEncoder) Close() error {
    95  	return nil
    96  }
    97  
    98  type byteArrayDeltaLengthDecoder struct {
    99  	r        io.Reader
   100  	position int
   101  	lens     []int32
   102  }
   103  
   104  func (b *byteArrayDeltaLengthDecoder) init(r io.Reader) error {
   105  	b.r = r
   106  	b.position = 0
   107  	lensDecoder := int32DeltaBPDecoder{}
   108  	if err := lensDecoder.init(r); err != nil {
   109  		return err
   110  	}
   111  
   112  	b.lens = make([]int32, lensDecoder.valuesCount)
   113  	return decodeInt32(&lensDecoder, b.lens)
   114  }
   115  
   116  func (b *byteArrayDeltaLengthDecoder) next() ([]byte, error) {
   117  	if b.position >= len(b.lens) {
   118  		return nil, io.EOF
   119  	}
   120  	size := int(b.lens[b.position])
   121  	value := make([]byte, size)
   122  	if _, err := io.ReadFull(b.r, value); err != nil {
   123  		return nil, fmt.Errorf("there is no byte left: %w", err)
   124  	}
   125  	b.position++
   126  
   127  	return value, nil
   128  }
   129  
   130  func (b *byteArrayDeltaLengthDecoder) decodeValues(dst []interface{}) (int, error) {
   131  	total := len(dst)
   132  	for i := 0; i < total; i++ {
   133  		v, err := b.next()
   134  		if err != nil {
   135  			return i, err
   136  		}
   137  		dst[i] = v
   138  	}
   139  	return total, nil
   140  }
   141  
   142  // this type is used inside the byteArrayDeltaEncoder, the Close method should do the actual write, not before.
   143  type byteArrayDeltaLengthEncoder struct {
   144  	w    io.Writer
   145  	buf  *bytes.Buffer
   146  	lens []interface{}
   147  }
   148  
   149  func (b *byteArrayDeltaLengthEncoder) init(w io.Writer) error {
   150  	b.w = w
   151  	b.buf = &bytes.Buffer{}
   152  	return nil
   153  }
   154  
   155  func (b *byteArrayDeltaLengthEncoder) writeOne(data []byte) error {
   156  	b.lens = append(b.lens, int32(len(data)))
   157  	return writeFull(b.buf, data)
   158  }
   159  
   160  func (b *byteArrayDeltaLengthEncoder) encodeValues(values []interface{}) error {
   161  	if b.lens == nil {
   162  		// this is just for the first time, maybe we need to copy and increase the cap in the next calls?
   163  		b.lens = make([]interface{}, 0, len(values))
   164  	}
   165  	for i := range values {
   166  		if err := b.writeOne(values[i].([]byte)); err != nil {
   167  			return err
   168  		}
   169  	}
   170  
   171  	return nil
   172  }
   173  
   174  func (b *byteArrayDeltaLengthEncoder) Close() error {
   175  	enc := &int32DeltaBPEncoder{
   176  		deltaBitPackEncoder32: deltaBitPackEncoder32{
   177  			blockSize:      128,
   178  			miniBlockCount: 4,
   179  		},
   180  	}
   181  
   182  	if err := encodeValue(b.w, enc, b.lens); err != nil {
   183  		return err
   184  	}
   185  
   186  	return writeFull(b.w, b.buf.Bytes())
   187  }
   188  
   189  type byteArrayDeltaDecoder struct {
   190  	suffixDecoder byteArrayDeltaLengthDecoder
   191  	prefixLens    []int32
   192  	previousValue []byte
   193  }
   194  
   195  func (d *byteArrayDeltaDecoder) init(r io.Reader) error {
   196  	lensDecoder := deltaBitPackDecoder32{}
   197  	if err := lensDecoder.init(r); err != nil {
   198  		return err
   199  	}
   200  
   201  	d.prefixLens = make([]int32, lensDecoder.valuesCount)
   202  	if err := decodeInt32(&lensDecoder, d.prefixLens); err != nil {
   203  		return err
   204  	}
   205  	if err := d.suffixDecoder.init(r); err != nil {
   206  		return err
   207  	}
   208  
   209  	if len(d.prefixLens) != len(d.suffixDecoder.lens) {
   210  		return errors.New("bytearray/delta: different number of suffixes and prefixes")
   211  	}
   212  	d.previousValue = make([]byte, 0)
   213  
   214  	return nil
   215  }
   216  
   217  func (d *byteArrayDeltaDecoder) decodeValues(dst []interface{}) (int, error) {
   218  	total := len(dst)
   219  	for i := 0; i < total; i++ {
   220  		suffix, err := d.suffixDecoder.next()
   221  		if err != nil {
   222  			return i, err
   223  		}
   224  		// after this line no error is acceptable
   225  		prefixLen := int(d.prefixLens[d.suffixDecoder.position-1])
   226  		value := make([]byte, 0, prefixLen+len(suffix))
   227  		if len(d.previousValue) < prefixLen {
   228  			// prevent panic from invalid input
   229  			return 0, fmt.Errorf("invalid prefix len in the stream, the value is %d byte but the it needs %d byte", len(d.previousValue), prefixLen)
   230  		}
   231  		if prefixLen > 0 {
   232  			value = append(value, d.previousValue[:prefixLen]...)
   233  		}
   234  		value = append(value, suffix...)
   235  		d.previousValue = value
   236  		dst[i] = value
   237  	}
   238  
   239  	return total, nil
   240  }
   241  
   242  type byteArrayDeltaEncoder struct {
   243  	w io.Writer
   244  
   245  	prefixLens    []interface{}
   246  	previousValue []byte
   247  
   248  	values *byteArrayDeltaLengthEncoder
   249  }
   250  
   251  func (b *byteArrayDeltaEncoder) init(w io.Writer) error {
   252  	b.w = w
   253  	b.prefixLens = nil
   254  	b.previousValue = []byte{}
   255  	b.values = &byteArrayDeltaLengthEncoder{}
   256  	return b.values.init(w)
   257  }
   258  
   259  func (b *byteArrayDeltaEncoder) encodeValues(values []interface{}) error {
   260  	if b.prefixLens == nil {
   261  		b.prefixLens = make([]interface{}, 0, len(values))
   262  		b.values.lens = make([]interface{}, 0, len(values))
   263  	}
   264  
   265  	for i := range values {
   266  		data := values[i].([]byte)
   267  		pLen := prefix(b.previousValue, data)
   268  		b.prefixLens = append(b.prefixLens, int32(pLen))
   269  		if err := b.values.writeOne(data[pLen:]); err != nil {
   270  			return err
   271  		}
   272  		b.previousValue = data
   273  	}
   274  
   275  	return nil
   276  }
   277  
   278  func (b *byteArrayDeltaEncoder) Close() error {
   279  	// write the lens first
   280  	enc := &int32DeltaBPEncoder{
   281  		deltaBitPackEncoder32: deltaBitPackEncoder32{
   282  			blockSize:      128,
   283  			miniBlockCount: 4,
   284  		},
   285  	}
   286  
   287  	if err := encodeValue(b.w, enc, b.prefixLens); err != nil {
   288  		return err
   289  	}
   290  
   291  	return b.values.Close()
   292  }
   293  
   294  type byteArrayStore struct {
   295  	repTyp    parquet.FieldRepetitionType
   296  	stats     statistics
   297  	pageStats statistics
   298  
   299  	*ColumnParameters
   300  }
   301  
   302  func (is *byteArrayStore) getStats() minMaxValues {
   303  	return &is.stats
   304  }
   305  
   306  func (is *byteArrayStore) getPageStats() minMaxValues {
   307  	return &is.pageStats
   308  }
   309  
   310  func (is *byteArrayStore) params() *ColumnParameters {
   311  	if is.ColumnParameters == nil {
   312  		panic("ColumnParameters is nil")
   313  	}
   314  	return is.ColumnParameters
   315  }
   316  
   317  func (is *byteArrayStore) sizeOf(v interface{}) int {
   318  	if vv, ok := v.([][]byte); ok {
   319  		l := 0
   320  		for _, vvv := range vv {
   321  			l += len(vvv)
   322  		}
   323  		return l
   324  	}
   325  	return len(v.([]byte))
   326  }
   327  
   328  func (is *byteArrayStore) parquetType() parquet.Type {
   329  	if is.TypeLength != nil && *is.TypeLength > 0 {
   330  		return parquet.Type_FIXED_LEN_BYTE_ARRAY
   331  	}
   332  	return parquet.Type_BYTE_ARRAY
   333  }
   334  
   335  func (is *byteArrayStore) repetitionType() parquet.FieldRepetitionType {
   336  	return is.repTyp
   337  }
   338  
   339  func (is *byteArrayStore) reset(repetitionType parquet.FieldRepetitionType) {
   340  	is.repTyp = repetitionType
   341  
   342  	is.stats.reset()
   343  	is.pageStats.reset()
   344  }
   345  
   346  func (is *byteArrayStore) setMinMax(j []byte) error {
   347  	if is.TypeLength != nil && *is.TypeLength > 0 && int32(len(j)) != *is.TypeLength {
   348  		return fmt.Errorf("the size of data should be %d but is %d", *is.TypeLength, len(j))
   349  	}
   350  	// For nil value there is no need to set the min/max
   351  	if j == nil {
   352  		return nil
   353  	}
   354  
   355  	is.stats.setMinMax(j)
   356  	is.pageStats.setMinMax(j)
   357  
   358  	return nil
   359  }
   360  
   361  func (is *byteArrayStore) getValues(v interface{}) ([]interface{}, error) {
   362  	var vals []interface{}
   363  	switch typed := v.(type) {
   364  	case []byte:
   365  		vals = []interface{}{typed}
   366  	case [][]byte:
   367  		if is.repTyp != parquet.FieldRepetitionType_REPEATED {
   368  			return nil, fmt.Errorf("the value is not repeated but it is an array")
   369  		}
   370  		vals = make([]interface{}, len(typed))
   371  		for j := range typed {
   372  			vals[j] = typed[j]
   373  		}
   374  	case string:
   375  		vals = []interface{}{[]byte(typed)}
   376  	case []string:
   377  		if is.repTyp != parquet.FieldRepetitionType_REPEATED {
   378  			return nil, fmt.Errorf("the value is not repeated but it is an array")
   379  		}
   380  		vals = make([]interface{}, len(typed))
   381  		for j := range typed {
   382  			vals[j] = []byte(typed[j])
   383  		}
   384  	default:
   385  		return nil, fmt.Errorf("unsupported type for storing in []byte column %T => %+v", v, v)
   386  	}
   387  
   388  	return vals, nil
   389  }
   390  
   391  func (*byteArrayStore) append(arrayIn interface{}, value interface{}) interface{} {
   392  	if arrayIn == nil {
   393  		arrayIn = make([][]byte, 0, 1)
   394  	}
   395  	return append(arrayIn.([][]byte), value.([]byte))
   396  }