github.com/parquet-go/parquet-go@v0.20.0/column_chunk.go (about)

     1  package parquet
     2  
     3  import (
     4  	"errors"
     5  	"io"
     6  )
     7  
var (
	// ErrMissingColumnIndex is the error returned by ColumnChunk.ColumnIndex
	// when the column chunk does not have a column index.
	ErrMissingColumnIndex = errors.New("missing column index")

	// ErrMissingOffsetIndex is the error returned by ColumnChunk.OffsetIndex
	// when the column chunk does not have an offset index.
	ErrMissingOffsetIndex = errors.New("missing offset index")
)
    12  
// The ColumnChunk interface represents individual columns of a row group.
type ColumnChunk interface {
	// Returns the column type.
	Type() Type

	// Returns the index of this column in its parent row group.
	Column() int

	// Returns a reader exposing the pages of the column.
	Pages() Pages

	// Returns the components of the page index for this column chunk,
	// containing details about the content and location of pages within the
	// chunk.
	//
	// Note that the returned value may be the same across calls to these
	// methods; programs must treat those as read-only.
	//
	// If the column chunk does not have a column or offset index, the methods return
	// ErrMissingColumnIndex or ErrMissingOffsetIndex respectively.
	//
	// Prior to v0.20, these methods did not return an error because the page index
	// for a file was either fully read when the file was opened, or skipped
	// completely using the parquet.SkipPageIndex option. Version v0.20 introduced a
	// change that the page index can be read on-demand at any time, even if a file
	// was opened with the parquet.SkipPageIndex option. Since reading the page index
	// can fail, these methods now return an error.
	ColumnIndex() (ColumnIndex, error)
	// Returns the offset index of the column chunk; see the notes on
	// ColumnIndex, which apply to this method as well.
	OffsetIndex() (OffsetIndex, error)
	// Returns the bloom filter of the column chunk.
	//
	// NOTE(review): the value returned when the column chunk has no bloom
	// filter is not visible from this file; confirm against implementations.
	BloomFilter() BloomFilter

	// Returns the number of values in the column chunk.
	//
	// This quantity may differ from the number of rows in the parent row group
	// because repeated columns may hold zero or more values per row.
	NumValues() int64
}
    50  
// pageAndValueWriter groups the PageWriter and ValueWriter interfaces, for
// values that must support writing both whole pages and individual values.
type pageAndValueWriter interface {
	PageWriter
	ValueWriter
}
    55  
// readRowsFunc is the signature of the functions assembled by readRowsFuncOf
// to read column values into rows.
//
// The function appends values read from the columns of the rowGroupRows to
// the rows it is given; the byte argument is the repetition level that
// values of repeated columns must carry in order to be consumed. It returns
// the number of rows that were read and any error that occurred, which is
// io.EOF for a group without fields and may otherwise come from reading the
// column values.
type readRowsFunc func(*rowGroupRows, []Row, byte) (int, error)
    57  
    58  func readRowsFuncOf(node Node, columnIndex int, repetitionDepth byte) (int, readRowsFunc) {
    59  	var read readRowsFunc
    60  
    61  	if node.Repeated() {
    62  		repetitionDepth++
    63  	}
    64  
    65  	if node.Leaf() {
    66  		columnIndex, read = readRowsFuncOfLeaf(columnIndex, repetitionDepth)
    67  	} else {
    68  		columnIndex, read = readRowsFuncOfGroup(node, columnIndex, repetitionDepth)
    69  	}
    70  
    71  	if node.Repeated() {
    72  		read = readRowsFuncOfRepeated(read, repetitionDepth)
    73  	}
    74  
    75  	return columnIndex, read
    76  }
    77  
    78  //go:noinline
    79  func readRowsFuncOfRepeated(read readRowsFunc, repetitionDepth byte) readRowsFunc {
    80  	return func(r *rowGroupRows, rows []Row, repetitionLevel byte) (int, error) {
    81  		for i := range rows {
    82  			// Repeated columns have variable number of values, we must process
    83  			// them one row at a time because we cannot predict how many values
    84  			// need to be consumed in each iteration.
    85  			row := rows[i : i+1]
    86  
    87  			// The first pass looks for values marking the beginning of a row by
    88  			// having a repetition level equal to the current level.
    89  			n, err := read(r, row, repetitionLevel)
    90  			if err != nil {
    91  				// The error here may likely be io.EOF, the read function may
    92  				// also have successfully read a row, which is indicated by a
    93  				// non-zero count. In this case, we increment the index to
    94  				// indicate to the caller than rows up to i+1 have been read.
    95  				if n > 0 {
    96  					i++
    97  				}
    98  				return i, err
    99  			}
   100  
   101  			// The read function may return no errors and also read no rows in
   102  			// case where it had more values to read but none corresponded to
   103  			// the current repetition level. This is an indication that we will
   104  			// not be able to read more rows at this stage, we must return to
   105  			// the caller to let it set the repetition level to its current
   106  			// depth, which may allow us to read more values when called again.
   107  			if n == 0 {
   108  				return i, nil
   109  			}
   110  
   111  			// When we reach this stage, we have successfully read the first
   112  			// values of a row of repeated columns. We continue consuming more
   113  			// repeated values until we get the indication that we consumed
   114  			// them all (the read function returns zero and no errors).
   115  			for {
   116  				n, err := read(r, row, repetitionDepth)
   117  				if err != nil {
   118  					return i + 1, err
   119  				}
   120  				if n == 0 {
   121  					break
   122  				}
   123  			}
   124  		}
   125  		return len(rows), nil
   126  	}
   127  }
   128  
   129  //go:noinline
   130  func readRowsFuncOfGroup(node Node, columnIndex int, repetitionDepth byte) (int, readRowsFunc) {
   131  	fields := node.Fields()
   132  
   133  	if len(fields) == 0 {
   134  		return columnIndex, func(*rowGroupRows, []Row, byte) (int, error) {
   135  			return 0, io.EOF
   136  		}
   137  	}
   138  
   139  	if len(fields) == 1 {
   140  		// Small optimization for a somewhat common case of groups with a single
   141  		// column (like nested list elements for example); there is no need to
   142  		// loop over the group of a single element, we can simply skip to calling
   143  		// the inner read function.
   144  		return readRowsFuncOf(fields[0], columnIndex, repetitionDepth)
   145  	}
   146  
   147  	group := make([]readRowsFunc, len(fields))
   148  	for i := range group {
   149  		columnIndex, group[i] = readRowsFuncOf(fields[i], columnIndex, repetitionDepth)
   150  	}
   151  
   152  	return columnIndex, func(r *rowGroupRows, rows []Row, repetitionLevel byte) (int, error) {
   153  		// When reading a group, we use the first column as an indicator of how
   154  		// may rows can be read during this call.
   155  		n, err := group[0](r, rows, repetitionLevel)
   156  
   157  		if n > 0 {
   158  			// Read values for all rows that the group is able to consume.
   159  			// Getting io.EOF from calling the read functions indicate that
   160  			// we consumed all values of that particular column, but there may
   161  			// be more to read in other columns, therefore we must always read
   162  			// all columns and cannot stop on the first error.
   163  			for _, read := range group[1:] {
   164  				_, err2 := read(r, rows[:n], repetitionLevel)
   165  				if err2 != nil && err2 != io.EOF {
   166  					return 0, err2
   167  				}
   168  			}
   169  		}
   170  
   171  		return n, err
   172  	}
   173  }
   174  
   175  //go:noinline
   176  func readRowsFuncOfLeaf(columnIndex int, repetitionDepth byte) (int, readRowsFunc) {
   177  	var read readRowsFunc
   178  
   179  	if repetitionDepth == 0 {
   180  		read = func(r *rowGroupRows, rows []Row, _ byte) (int, error) {
   181  			// When the repetition depth is zero, we know that there is exactly
   182  			// one value per row for this column, and therefore we can consume
   183  			// as many values as there are rows to fill.
   184  			col := &r.columns[columnIndex]
   185  			buf := r.buffer(columnIndex)
   186  
   187  			for i := range rows {
   188  				if col.offset == col.length {
   189  					n, err := col.values.ReadValues(buf)
   190  					col.offset = 0
   191  					col.length = int32(n)
   192  					if n == 0 && err != nil {
   193  						return 0, err
   194  					}
   195  				}
   196  
   197  				rows[i] = append(rows[i], buf[col.offset])
   198  				col.offset++
   199  			}
   200  
   201  			return len(rows), nil
   202  		}
   203  	} else {
   204  		read = func(r *rowGroupRows, rows []Row, repetitionLevel byte) (int, error) {
   205  			// When the repetition depth is not zero, we know that we will be
   206  			// called with a single row as input. We attempt to read at most one
   207  			// value of a single row and return to the caller.
   208  			col := &r.columns[columnIndex]
   209  			buf := r.buffer(columnIndex)
   210  
   211  			if col.offset == col.length {
   212  				n, err := col.values.ReadValues(buf)
   213  				col.offset = 0
   214  				col.length = int32(n)
   215  				if n == 0 && err != nil {
   216  					return 0, err
   217  				}
   218  			}
   219  
   220  			if buf[col.offset].repetitionLevel != repetitionLevel {
   221  				return 0, nil
   222  			}
   223  
   224  			rows[0] = append(rows[0], buf[col.offset])
   225  			col.offset++
   226  			return 1, nil
   227  		}
   228  	}
   229  
   230  	return columnIndex + 1, read
   231  }