github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/page_values.go (about)

     1  package parquet
     2  
     3  import (
     4  	"io"
     5  
     6  	"github.com/vc42/parquet-go/deprecated"
     7  	"github.com/vc42/parquet-go/encoding/plain"
     8  	"github.com/vc42/parquet-go/internal/unsafecast"
     9  )
    10  
// optionalPageValues holds the state needed to read the values of an
// optionalPage one batch at a time.
type optionalPageValues struct {
	// page supplies the definition levels, the maximum definition level and
	// the column index used by ReadValues.
	page   *optionalPage
	// values is the underlying reader asked for the entries whose definition
	// level equals the page's maximum definition level.
	values ValueReader
	// offset is the index of the next entry of page.definitionLevels to
	// consume.
	offset int
}
    16  
    17  func (r *optionalPageValues) ReadValues(values []Value) (n int, err error) {
    18  	maxDefinitionLevel := r.page.maxDefinitionLevel
    19  	columnIndex := ^int16(r.page.Column())
    20  
    21  	for n < len(values) && r.offset < len(r.page.definitionLevels) {
    22  		for n < len(values) && r.offset < len(r.page.definitionLevels) && r.page.definitionLevels[r.offset] != maxDefinitionLevel {
    23  			values[n] = Value{
    24  				definitionLevel: r.page.definitionLevels[r.offset],
    25  				columnIndex:     columnIndex,
    26  			}
    27  			r.offset++
    28  			n++
    29  		}
    30  
    31  		i := n
    32  		j := r.offset
    33  		for i < len(values) && j < len(r.page.definitionLevels) && r.page.definitionLevels[j] == maxDefinitionLevel {
    34  			i++
    35  			j++
    36  		}
    37  
    38  		if n < i {
    39  			for j, err = r.values.ReadValues(values[n:i]); j > 0; j-- {
    40  				values[n].definitionLevel = maxDefinitionLevel
    41  				r.offset++
    42  				n++
    43  			}
    44  			// Do not return on an io.EOF here as we may still have null values to read.
    45  			if err != nil && err != io.EOF {
    46  				return n, err
    47  			}
    48  			err = nil
    49  		}
    50  	}
    51  
    52  	if r.offset == len(r.page.definitionLevels) {
    53  		err = io.EOF
    54  	}
    55  	return n, err
    56  }
    57  
// repeatedPageValues holds the state needed to read the values of a
// repeatedPage one batch at a time.
type repeatedPageValues struct {
	// page supplies the repetition levels, the definition levels, the maximum
	// definition level and the column index used by ReadValues.
	page   *repeatedPage
	// values is the underlying reader asked for the entries whose definition
	// level equals the page's maximum definition level.
	values ValueReader
	// offset is the index of the next entry of page.definitionLevels (and
	// page.repetitionLevels) to consume.
	offset int
}
    63  
    64  func (r *repeatedPageValues) ReadValues(values []Value) (n int, err error) {
    65  	maxDefinitionLevel := r.page.maxDefinitionLevel
    66  	columnIndex := ^int16(r.page.Column())
    67  
    68  	for n < len(values) && r.offset < len(r.page.definitionLevels) {
    69  		for n < len(values) && r.offset < len(r.page.definitionLevels) && r.page.definitionLevels[r.offset] != maxDefinitionLevel {
    70  			values[n] = Value{
    71  				repetitionLevel: r.page.repetitionLevels[r.offset],
    72  				definitionLevel: r.page.definitionLevels[r.offset],
    73  				columnIndex:     columnIndex,
    74  			}
    75  			r.offset++
    76  			n++
    77  		}
    78  
    79  		i := n
    80  		j := r.offset
    81  		for i < len(values) && j < len(r.page.definitionLevels) && r.page.definitionLevels[j] == maxDefinitionLevel {
    82  			i++
    83  			j++
    84  		}
    85  
    86  		if n < i {
    87  			for j, err = r.values.ReadValues(values[n:i]); j > 0; j-- {
    88  				values[n].repetitionLevel = r.page.repetitionLevels[r.offset]
    89  				values[n].definitionLevel = maxDefinitionLevel
    90  				r.offset++
    91  				n++
    92  			}
    93  			if err != nil && err != io.EOF {
    94  				return n, err
    95  			}
    96  			err = nil
    97  		}
    98  	}
    99  
   100  	if r.offset == len(r.page.definitionLevels) {
   101  		err = io.EOF
   102  	}
   103  	return n, err
   104  }
   105  
// booleanPageValues holds the state needed to read the values of a
// booleanPage sequentially.
type booleanPageValues struct {
	// page is the page whose values are read through valueAt.
	page   *booleanPage
	// offset is the index of the next value to read; reading stops when it
	// reaches page.numValues.
	offset int
}
   110  
   111  func (r *booleanPageValues) ReadBooleans(values []bool) (n int, err error) {
   112  	for n < len(values) && r.offset < int(r.page.numValues) {
   113  		values[n] = r.page.valueAt(r.offset)
   114  		r.offset++
   115  		n++
   116  	}
   117  	if r.offset == int(r.page.numValues) {
   118  		err = io.EOF
   119  	}
   120  	return n, err
   121  }
   122  
   123  func (r *booleanPageValues) ReadValues(values []Value) (n int, err error) {
   124  	for n < len(values) && r.offset < int(r.page.numValues) {
   125  		values[n] = r.page.makeValue(r.page.valueAt(r.offset))
   126  		r.offset++
   127  		n++
   128  	}
   129  	if r.offset == int(r.page.numValues) {
   130  		err = io.EOF
   131  	}
   132  	return n, err
   133  }
   134  
// int32PageValues holds the state needed to read the values of an int32Page
// sequentially.
type int32PageValues struct {
	// page is the page whose values slice is read.
	page   *int32Page
	// offset is the index of the next element of page.values to read.
	offset int
}
   139  
   140  func (r *int32PageValues) Read(b []byte) (n int, err error) {
   141  	n, err = r.ReadInt32s(unsafecast.BytesToInt32(b))
   142  	return 4 * n, err
   143  }
   144  
   145  func (r *int32PageValues) ReadInt32s(values []int32) (n int, err error) {
   146  	n = copy(values, r.page.values[r.offset:])
   147  	r.offset += n
   148  	if r.offset == len(r.page.values) {
   149  		err = io.EOF
   150  	}
   151  	return n, err
   152  }
   153  
   154  func (r *int32PageValues) ReadValues(values []Value) (n int, err error) {
   155  	for n < len(values) && r.offset < len(r.page.values) {
   156  		values[n] = r.page.makeValue(r.page.values[r.offset])
   157  		r.offset++
   158  		n++
   159  	}
   160  	if r.offset == len(r.page.values) {
   161  		err = io.EOF
   162  	}
   163  	return n, err
   164  }
   165  
// int64PageValues holds the state needed to read the values of an int64Page
// sequentially.
type int64PageValues struct {
	// page is the page whose values slice is read.
	page   *int64Page
	// offset is the index of the next element of page.values to read.
	offset int
}
   170  
   171  func (r *int64PageValues) Read(b []byte) (n int, err error) {
   172  	n, err = r.ReadInt64s(unsafecast.BytesToInt64(b))
   173  	return 8 * n, err
   174  }
   175  
   176  func (r *int64PageValues) ReadInt64s(values []int64) (n int, err error) {
   177  	n = copy(values, r.page.values[r.offset:])
   178  	r.offset += n
   179  	if r.offset == len(r.page.values) {
   180  		err = io.EOF
   181  	}
   182  	return n, err
   183  }
   184  
   185  func (r *int64PageValues) ReadValues(values []Value) (n int, err error) {
   186  	for n < len(values) && r.offset < len(r.page.values) {
   187  		values[n] = r.page.makeValue(r.page.values[r.offset])
   188  		r.offset++
   189  		n++
   190  	}
   191  	if r.offset == len(r.page.values) {
   192  		err = io.EOF
   193  	}
   194  	return n, err
   195  }
   196  
// int96PageValues holds the state needed to read the values of an int96Page
// sequentially.
type int96PageValues struct {
	// page is the page whose values slice is read.
	page   *int96Page
	// offset is the index of the next element of page.values to read.
	offset int
}
   201  
   202  func (r *int96PageValues) Read(b []byte) (n int, err error) {
   203  	n, err = r.ReadInt96s(deprecated.BytesToInt96(b))
   204  	return 12 * n, err
   205  }
   206  
   207  func (r *int96PageValues) ReadInt96s(values []deprecated.Int96) (n int, err error) {
   208  	n = copy(values, r.page.values[r.offset:])
   209  	r.offset += n
   210  	if r.offset == len(r.page.values) {
   211  		err = io.EOF
   212  	}
   213  	return n, err
   214  }
   215  
   216  func (r *int96PageValues) ReadValues(values []Value) (n int, err error) {
   217  	for n < len(values) && r.offset < len(r.page.values) {
   218  		values[n] = r.page.makeValue(r.page.values[r.offset])
   219  		r.offset++
   220  		n++
   221  	}
   222  	if r.offset == len(r.page.values) {
   223  		err = io.EOF
   224  	}
   225  	return n, err
   226  }
   227  
// floatPageValues holds the state needed to read the values of a floatPage
// sequentially.
type floatPageValues struct {
	// page is the page whose values slice is read.
	page   *floatPage
	// offset is the index of the next element of page.values to read.
	offset int
}
   232  
   233  func (r *floatPageValues) Read(b []byte) (n int, err error) {
   234  	n, err = r.ReadFloats(unsafecast.BytesToFloat32(b))
   235  	return 4 * n, err
   236  }
   237  
   238  func (r *floatPageValues) ReadFloats(values []float32) (n int, err error) {
   239  	n = copy(values, r.page.values[r.offset:])
   240  	r.offset += n
   241  	if r.offset == len(r.page.values) {
   242  		err = io.EOF
   243  	}
   244  	return n, err
   245  }
   246  
   247  func (r *floatPageValues) ReadValues(values []Value) (n int, err error) {
   248  	for n < len(values) && r.offset < len(r.page.values) {
   249  		values[n] = r.page.makeValue(r.page.values[r.offset])
   250  		r.offset++
   251  		n++
   252  	}
   253  	if r.offset == len(r.page.values) {
   254  		err = io.EOF
   255  	}
   256  	return n, err
   257  }
   258  
// doublePageValues holds the state needed to read the values of a doublePage
// sequentially.
type doublePageValues struct {
	// page is the page whose values slice is read.
	page   *doublePage
	// offset is the index of the next element of page.values to read.
	offset int
}
   263  
   264  func (r *doublePageValues) Read(b []byte) (n int, err error) {
   265  	n, err = r.ReadDoubles(unsafecast.BytesToFloat64(b))
   266  	return 8 * n, err
   267  }
   268  
   269  func (r *doublePageValues) ReadDoubles(values []float64) (n int, err error) {
   270  	n = copy(values, r.page.values[r.offset:])
   271  	r.offset += n
   272  	if r.offset == len(r.page.values) {
   273  		err = io.EOF
   274  	}
   275  	return n, err
   276  }
   277  
   278  func (r *doublePageValues) ReadValues(values []Value) (n int, err error) {
   279  	for n < len(values) && r.offset < len(r.page.values) {
   280  		values[n] = r.page.makeValue(r.page.values[r.offset])
   281  		r.offset++
   282  		n++
   283  	}
   284  	if r.offset == len(r.page.values) {
   285  		err = io.EOF
   286  	}
   287  	return n, err
   288  }
   289  
// byteArrayPageValues holds the state needed to read the values of a
// byteArrayPage sequentially.
type byteArrayPageValues struct {
	// page is the page whose values are read through valueAt.
	page   *byteArrayPage
	// offset is the position in page.values of the next value to read; it
	// advances by plain.ByteArrayLengthSize plus the value's length for each
	// value consumed.
	offset int
}
   294  
   295  func (r *byteArrayPageValues) Read(b []byte) (int, error) {
   296  	_, n, err := r.readByteArrays(b)
   297  	return n, err
   298  }
   299  
   300  func (r *byteArrayPageValues) ReadRequired(values []byte) (int, error) {
   301  	return r.ReadByteArrays(values)
   302  }
   303  
   304  func (r *byteArrayPageValues) ReadByteArrays(values []byte) (int, error) {
   305  	n, _, err := r.readByteArrays(values)
   306  	return n, err
   307  }
   308  
   309  func (r *byteArrayPageValues) readByteArrays(values []byte) (c, n int, err error) {
   310  	for r.offset < len(r.page.values) {
   311  		b := r.page.valueAt(uint32(r.offset))
   312  		k := plain.ByteArrayLengthSize + len(b)
   313  		if k > (len(values) - n) {
   314  			break
   315  		}
   316  		plain.PutByteArrayLength(values[n:], len(b))
   317  		n += plain.ByteArrayLengthSize
   318  		n += copy(values[n:], b)
   319  		r.offset += plain.ByteArrayLengthSize
   320  		r.offset += len(b)
   321  		c++
   322  	}
   323  	if r.offset == len(r.page.values) {
   324  		err = io.EOF
   325  	} else if n == 0 && len(values) > 0 {
   326  		err = io.ErrShortBuffer
   327  	}
   328  	return c, n, err
   329  }
   330  
   331  func (r *byteArrayPageValues) ReadValues(values []Value) (n int, err error) {
   332  	for n < len(values) && r.offset < len(r.page.values) {
   333  		value := r.page.valueAt(uint32(r.offset))
   334  		values[n] = r.page.makeValueBytes(copyBytes(value))
   335  		r.offset += plain.ByteArrayLengthSize
   336  		r.offset += len(value)
   337  		n++
   338  	}
   339  	if r.offset == len(r.page.values) {
   340  		err = io.EOF
   341  	}
   342  	return n, err
   343  }
   344  
// fixedLenByteArrayPageValues holds the state needed to read the values of a
// fixedLenByteArrayPage sequentially.
type fixedLenByteArrayPageValues struct {
	// page is the page whose data buffer and value size are read.
	page   *fixedLenByteArrayPage
	// offset is the byte position in page.data of the next value to read; it
	// advances by page.size for each value consumed.
	offset int
}
   349  
   350  func (r *fixedLenByteArrayPageValues) Read(b []byte) (n int, err error) {
   351  	n, err = r.ReadFixedLenByteArrays(b)
   352  	return n * r.page.size, err
   353  }
   354  
   355  func (r *fixedLenByteArrayPageValues) ReadRequired(values []byte) (int, error) {
   356  	return r.ReadFixedLenByteArrays(values)
   357  }
   358  
   359  func (r *fixedLenByteArrayPageValues) ReadFixedLenByteArrays(values []byte) (n int, err error) {
   360  	n = copy(values, r.page.data[r.offset:]) / r.page.size
   361  	r.offset += n * r.page.size
   362  	if r.offset == len(r.page.data) {
   363  		err = io.EOF
   364  	} else if n == 0 && len(values) > 0 {
   365  		err = io.ErrShortBuffer
   366  	}
   367  	return n, err
   368  }
   369  
   370  func (r *fixedLenByteArrayPageValues) ReadValues(values []Value) (n int, err error) {
   371  	for n < len(values) && r.offset < len(r.page.data) {
   372  		values[n] = r.page.makeValueBytes(copyBytes(r.page.data[r.offset : r.offset+r.page.size]))
   373  		r.offset += r.page.size
   374  		n++
   375  	}
   376  	if r.offset == len(r.page.data) {
   377  		err = io.EOF
   378  	}
   379  	return n, err
   380  }
   381  
// uint32PageValues holds the state needed to read the values of a uint32Page
// sequentially.
type uint32PageValues struct {
	// page is the page whose values slice is read.
	page   *uint32Page
	// offset is the index of the next element of page.values to read.
	offset int
}
   386  
   387  func (r *uint32PageValues) Read(b []byte) (n int, err error) {
   388  	n, err = r.ReadUint32s(unsafecast.BytesToUint32(b))
   389  	return 4 * n, err
   390  }
   391  
   392  func (r *uint32PageValues) ReadUint32s(values []uint32) (n int, err error) {
   393  	n = copy(values, r.page.values[r.offset:])
   394  	r.offset += n
   395  	if r.offset == len(r.page.values) {
   396  		err = io.EOF
   397  	}
   398  	return n, err
   399  }
   400  
   401  func (r *uint32PageValues) ReadValues(values []Value) (n int, err error) {
   402  	for n < len(values) && r.offset < len(r.page.values) {
   403  		values[n] = r.page.makeValue(r.page.values[r.offset])
   404  		r.offset++
   405  		n++
   406  	}
   407  	if r.offset == len(r.page.values) {
   408  		err = io.EOF
   409  	}
   410  	return n, err
   411  }
   412  
// uint64PageValues holds the state needed to read the values of a uint64Page
// sequentially.
type uint64PageValues struct {
	// page is the page whose values slice is read.
	page   *uint64Page
	// offset is the index of the next element of page.values to read.
	offset int
}
   417  
   418  func (r *uint64PageValues) Read(b []byte) (n int, err error) {
   419  	n, err = r.ReadUint64s(unsafecast.BytesToUint64(b))
   420  	return 8 * n, err
   421  }
   422  
   423  func (r *uint64PageValues) ReadUint64s(values []uint64) (n int, err error) {
   424  	n = copy(values, r.page.values[r.offset:])
   425  	r.offset += n
   426  	if r.offset == len(r.page.values) {
   427  		err = io.EOF
   428  	}
   429  	return n, err
   430  }
   431  
   432  func (r *uint64PageValues) ReadValues(values []Value) (n int, err error) {
   433  	for n < len(values) && r.offset < len(r.page.values) {
   434  		values[n] = r.page.makeValue(r.page.values[r.offset])
   435  		r.offset++
   436  		n++
   437  	}
   438  	if r.offset == len(r.page.values) {
   439  		err = io.EOF
   440  	}
   441  	return n, err
   442  }
   443  
// be128PageValues holds the state needed to read the values of a be128Page
// sequentially.
type be128PageValues struct {
	// page is the page whose values slice is read.
	page   *be128Page
	// offset is the index of the next element of page.values to read.
	offset int
}
   448  
   449  func (r *be128PageValues) ReadValues(values []Value) (n int, err error) {
   450  	for n < len(values) && r.offset < len(r.page.values) {
   451  		values[n] = r.page.makeValue(&r.page.values[r.offset])
   452  		r.offset++
   453  		n++
   454  	}
   455  	if r.offset == len(r.page.values) {
   456  		err = io.EOF
   457  	}
   458  	return n, err
   459  }
   460  
// nullPageValues holds the state needed to produce a fixed number of values
// that carry only a column index.
type nullPageValues struct {
	// column is the column index; ReadValues stores its bitwise complement in
	// each value it produces.
	column int
	// remain is the number of values still to be produced.
	remain int
}
   465  
   466  func (r *nullPageValues) ReadValues(values []Value) (n int, err error) {
   467  	columnIndex := ^int16(r.column)
   468  	values = values[:min(r.remain, len(values))]
   469  	for i := range values {
   470  		values[i] = Value{columnIndex: columnIndex}
   471  	}
   472  	r.remain -= len(values)
   473  	if r.remain == 0 {
   474  		err = io.EOF
   475  	}
   476  	return len(values), err
   477  }