github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/column_chunk.go (about)

     1  package parquet
     2  
     3  import (
     4  	"io"
     5  )
     6  
     7  // The ColumnChunk interface represents individual columns of a row group.
     8  type ColumnChunk interface {
     9  	// Returns the column type.
    10  	Type() Type
    11  
    12  	// Returns the index of this column in its parent row group.
    13  	Column() int
    14  
    15  	// Returns a reader exposing the pages of the column.
    16  	Pages() Pages
    17  
    18  	// Returns the components of the page index for this column chunk,
    19  	// containing details about the content and location of pages within the
    20  	// chunk.
    21  	//
    22  	// Note that the returned value may be the same across calls to these
    23  	// methods, programs must treat those as read-only.
    24  	//
    25  	// If the column chunk does not have a page index, the methods return nil.
    26  	ColumnIndex() ColumnIndex
    27  	OffsetIndex() OffsetIndex
    28  	BloomFilter() BloomFilter
    29  
    30  	// Returns the number of values in the column chunk.
    31  	//
    32  	// This quantity may differ from the number of rows in the parent row group
    33  	// because repeated columns may hold zero or more values per row.
    34  	NumValues() int64
    35  }
    36  
    37  type pageAndValueWriter interface {
    38  	PageWriter
    39  	ValueWriter
    40  }
    41  
    42  type readRowsFunc func(*rowGroupRows, []Row, byte) (int, error)
    43  
    44  func readRowsFuncOf(node Node, columnIndex int, repetitionDepth byte) (int, readRowsFunc) {
    45  	var read readRowsFunc
    46  
    47  	if node.Repeated() {
    48  		repetitionDepth++
    49  	}
    50  
    51  	if node.Leaf() {
    52  		columnIndex, read = readRowsFuncOfLeaf(columnIndex, repetitionDepth)
    53  	} else {
    54  		columnIndex, read = readRowsFuncOfGroup(node, columnIndex, repetitionDepth)
    55  	}
    56  
    57  	if node.Repeated() {
    58  		read = readRowsFuncOfRepeated(read, repetitionDepth)
    59  	}
    60  
    61  	return columnIndex, read
    62  }
    63  
    64  //go:noinline
    65  func readRowsFuncOfRepeated(read readRowsFunc, repetitionDepth byte) readRowsFunc {
    66  	return func(r *rowGroupRows, rows []Row, repetitionLevel byte) (int, error) {
    67  		for i := range rows {
    68  			// Repeated columns have variable number of values, we must process
    69  			// them one row at a time because we cannot predict how many values
    70  			// need to be consumed in each iteration.
    71  			row := rows[i : i+1]
    72  
    73  			// The first pass looks for values marking the beginning of a row by
    74  			// having a repetition level equal to the current level.
    75  			n, err := read(r, row, repetitionLevel)
    76  			if err != nil {
    77  				// The error here may likely be io.EOF, the read function may
    78  				// also have successfully read a row, which is indicated by a
    79  				// non-zero count. In this case, we increment the index to
    80  				// indicate to the caller than rows up to i+1 have been read.
    81  				if n > 0 {
    82  					i++
    83  				}
    84  				return i, err
    85  			}
    86  
    87  			// The read function may return no errors and also read no rows in
    88  			// case where it had more values to read but none corresponded to
    89  			// the current repetition level. This is an indication that we will
    90  			// not be able to read more rows at this stage, we must return to
    91  			// the caller to let it set the repetition level to its current
    92  			// depth, which may allow us to read more values when called again.
    93  			if n == 0 {
    94  				return i, nil
    95  			}
    96  
    97  			// When we reach this stage, we have successfully read the first
    98  			// values of a row of repeated columns. We continue consuming more
    99  			// repeated values until we get the indication that we consumed
   100  			// them all (the read function returns zero and no errors).
   101  			for {
   102  				n, err := read(r, row, repetitionDepth)
   103  				if err != nil {
   104  					return i + 1, err
   105  				}
   106  				if n == 0 {
   107  					break
   108  				}
   109  			}
   110  		}
   111  		return len(rows), nil
   112  	}
   113  }
   114  
   115  //go:noinline
   116  func readRowsFuncOfGroup(node Node, columnIndex int, repetitionDepth byte) (int, readRowsFunc) {
   117  	fields := node.Fields()
   118  
   119  	if len(fields) == 0 {
   120  		return columnIndex, func(*rowGroupRows, []Row, byte) (int, error) {
   121  			return 0, io.EOF
   122  		}
   123  	}
   124  
   125  	if len(fields) == 1 {
   126  		// Small optimization for a somewhat common case of groups with a single
   127  		// column (like nested list elements for example); there is no need to
   128  		// loop over the group of a single element, we can simply skip to calling
   129  		// the inner read function.
   130  		return readRowsFuncOf(fields[0], columnIndex, repetitionDepth)
   131  	}
   132  
   133  	group := make([]readRowsFunc, len(fields))
   134  	for i := range group {
   135  		columnIndex, group[i] = readRowsFuncOf(fields[i], columnIndex, repetitionDepth)
   136  	}
   137  
   138  	return columnIndex, func(r *rowGroupRows, rows []Row, repetitionLevel byte) (int, error) {
   139  		// When reading a group, we use the first column as an indicator of how
   140  		// may rows can be read during this call.
   141  		n, err := group[0](r, rows, repetitionLevel)
   142  
   143  		if n > 0 {
   144  			// Read values for all rows that the group is able to consume.
   145  			// Getting io.EOF from calling the read functions indicate that
   146  			// we consumed all values of that particular column, but there may
   147  			// be more to read in other columns, therefore we must always read
   148  			// all columns and cannot stop on the first error.
   149  			for _, read := range group[1:] {
   150  				_, err2 := read(r, rows[:n], repetitionLevel)
   151  				if err2 != nil && err2 != io.EOF {
   152  					return 0, err2
   153  				}
   154  			}
   155  		}
   156  
   157  		return n, err
   158  	}
   159  }
   160  
   161  //go:noinline
   162  func readRowsFuncOfLeaf(columnIndex int, repetitionDepth byte) (int, readRowsFunc) {
   163  	var read readRowsFunc
   164  
   165  	if repetitionDepth == 0 {
   166  		read = func(r *rowGroupRows, rows []Row, _ byte) (int, error) {
   167  			// When the repetition depth is zero, we know that there is exactly
   168  			// one value per row for this column, and therefore we can consume
   169  			// as many values as there are rows to fill.
   170  			col := &r.columns[columnIndex]
   171  			buf := r.buffer(columnIndex)
   172  
   173  			for i := range rows {
   174  				if col.offset == col.length {
   175  					n, err := col.values.ReadValues(buf)
   176  					col.offset = 0
   177  					col.length = int32(n)
   178  					if n == 0 && err != nil {
   179  						return 0, err
   180  					}
   181  				}
   182  
   183  				rows[i] = append(rows[i], buf[col.offset])
   184  				col.offset++
   185  			}
   186  
   187  			return len(rows), nil
   188  		}
   189  	} else {
   190  		read = func(r *rowGroupRows, rows []Row, repetitionLevel byte) (int, error) {
   191  			// When the repetition depth is not zero, we know that we will be
   192  			// called with a single row as input. We attempt to read at most one
   193  			// value of a single row and return to the caller.
   194  			col := &r.columns[columnIndex]
   195  			buf := r.buffer(columnIndex)
   196  
   197  			if col.offset == col.length {
   198  				n, err := col.values.ReadValues(buf)
   199  				col.offset = 0
   200  				col.length = int32(n)
   201  				if n == 0 && err != nil {
   202  					return 0, err
   203  				}
   204  			}
   205  
   206  			if buf[col.offset].repetitionLevel != repetitionLevel {
   207  				return 0, nil
   208  			}
   209  
   210  			rows[0] = append(rows[0], buf[col.offset])
   211  			col.offset++
   212  			return 1, nil
   213  		}
   214  	}
   215  
   216  	return columnIndex + 1, read
   217  }