github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/column_chunk.go (about) 1 package parquet 2 3 import ( 4 "errors" 5 "io" 6 ) 7 8 var ( 9 ErrMissingColumnIndex = errors.New("missing column index") 10 ErrMissingOffsetIndex = errors.New("missing offset index") 11 ) 12 13 // The ColumnChunk interface represents individual columns of a row group. 14 type ColumnChunk interface { 15 // Returns the column type. 16 Type() Type 17 18 // Returns the index of this column in its parent row group. 19 Column() int 20 21 // Returns a reader exposing the pages of the column. 22 Pages() Pages 23 24 // Returns the components of the page index for this column chunk, 25 // containing details about the content and location of pages within the 26 // chunk. 27 // 28 // Note that the returned value may be the same across calls to these 29 // methods, programs must treat those as read-only. 30 // 31 // If the column chunk does not have a column or offset index, the methods return 32 // ErrMissingColumnIndex or ErrMissingOffsetIndex respectively. 33 // 34 // Prior to v0.20, these methods did not return an error because the page index 35 // for a file was either fully read when the file was opened, or skipped 36 // completely using the parquet.SkipPageIndex option. Version v0.20 introduced a 37 // change that the page index can be read on-demand at any time, even if a file 38 // was opened with the parquet.SkipPageIndex option. Since reading the page index 39 // can fail, these methods now return an error. 40 ColumnIndex() (ColumnIndex, error) 41 OffsetIndex() (OffsetIndex, error) 42 BloomFilter() BloomFilter 43 44 // Returns the number of values in the column chunk. 45 // 46 // This quantity may differ from the number of rows in the parent row group 47 // because repeated columns may hold zero or more values per row. 48 NumValues() int64 49 } 50 51 type pageAndValueWriter interface { 52 PageWriter 53 ValueWriter 54 } 55 56 type readRowsFunc func(*rowGroupRows, []Row, byte) (int, error) 57 58 func readRowsFuncOf(node Node, columnIndex int, repetitionDepth byte) (int, readRowsFunc) { 59 var read readRowsFunc 60 61 if node.Repeated() { 62 repetitionDepth++ 63 } 64 65 if node.Leaf() { 66 columnIndex, read = readRowsFuncOfLeaf(columnIndex, repetitionDepth) 67 } else { 68 columnIndex, read = readRowsFuncOfGroup(node, columnIndex, repetitionDepth) 69 } 70 71 if node.Repeated() { 72 read = readRowsFuncOfRepeated(read, repetitionDepth) 73 } 74 75 return columnIndex, read 76 } 77 78 //go:noinline 79 func readRowsFuncOfRepeated(read readRowsFunc, repetitionDepth byte) readRowsFunc { 80 return func(r *rowGroupRows, rows []Row, repetitionLevel byte) (int, error) { 81 for i := range rows { 82 // Repeated columns have variable number of values, we must process 83 // them one row at a time because we cannot predict how many values 84 // need to be consumed in each iteration. 85 row := rows[i : i+1] 86 87 // The first pass looks for values marking the beginning of a row by 88 // having a repetition level equal to the current level. 89 n, err := read(r, row, repetitionLevel) 90 if err != nil { 91 // The error here may likely be io.EOF, the read function may 92 // also have successfully read a row, which is indicated by a 93 // non-zero count. In this case, we increment the index to 94 // indicate to the caller than rows up to i+1 have been read. 95 if n > 0 { 96 i++ 97 } 98 return i, err 99 } 100 101 // The read function may return no errors and also read no rows in 102 // case where it had more values to read but none corresponded to 103 // the current repetition level. This is an indication that we will 104 // not be able to read more rows at this stage, we must return to 105 // the caller to let it set the repetition level to its current 106 // depth, which may allow us to read more values when called again. 107 if n == 0 { 108 return i, nil 109 } 110 111 // When we reach this stage, we have successfully read the first 112 // values of a row of repeated columns. We continue consuming more 113 // repeated values until we get the indication that we consumed 114 // them all (the read function returns zero and no errors). 115 for { 116 n, err := read(r, row, repetitionDepth) 117 if err != nil { 118 return i + 1, err 119 } 120 if n == 0 { 121 break 122 } 123 } 124 } 125 return len(rows), nil 126 } 127 } 128 129 //go:noinline 130 func readRowsFuncOfGroup(node Node, columnIndex int, repetitionDepth byte) (int, readRowsFunc) { 131 fields := node.Fields() 132 133 if len(fields) == 0 { 134 return columnIndex, func(*rowGroupRows, []Row, byte) (int, error) { 135 return 0, io.EOF 136 } 137 } 138 139 if len(fields) == 1 { 140 // Small optimization for a somewhat common case of groups with a single 141 // column (like nested list elements for example); there is no need to 142 // loop over the group of a single element, we can simply skip to calling 143 // the inner read function. 144 return readRowsFuncOf(fields[0], columnIndex, repetitionDepth) 145 } 146 147 group := make([]readRowsFunc, len(fields)) 148 for i := range group { 149 columnIndex, group[i] = readRowsFuncOf(fields[i], columnIndex, repetitionDepth) 150 } 151 152 return columnIndex, func(r *rowGroupRows, rows []Row, repetitionLevel byte) (int, error) { 153 // When reading a group, we use the first column as an indicator of how 154 // may rows can be read during this call. 155 n, err := group[0](r, rows, repetitionLevel) 156 157 if n > 0 { 158 // Read values for all rows that the group is able to consume. 159 // Getting io.EOF from calling the read functions indicate that 160 // we consumed all values of that particular column, but there may 161 // be more to read in other columns, therefore we must always read 162 // all columns and cannot stop on the first error. 163 for _, read := range group[1:] { 164 _, err2 := read(r, rows[:n], repetitionLevel) 165 if err2 != nil && err2 != io.EOF { 166 return 0, err2 167 } 168 } 169 } 170 171 return n, err 172 } 173 } 174 175 //go:noinline 176 func readRowsFuncOfLeaf(columnIndex int, repetitionDepth byte) (int, readRowsFunc) { 177 var read readRowsFunc 178 179 if repetitionDepth == 0 { 180 read = func(r *rowGroupRows, rows []Row, _ byte) (int, error) { 181 // When the repetition depth is zero, we know that there is exactly 182 // one value per row for this column, and therefore we can consume 183 // as many values as there are rows to fill. 184 col := &r.columns[columnIndex] 185 buf := r.buffer(columnIndex) 186 187 for i := range rows { 188 if col.offset == col.length { 189 n, err := col.values.ReadValues(buf) 190 col.offset = 0 191 col.length = int32(n) 192 if n == 0 && err != nil { 193 return 0, err 194 } 195 } 196 197 rows[i] = append(rows[i], buf[col.offset]) 198 col.offset++ 199 } 200 201 return len(rows), nil 202 } 203 } else { 204 read = func(r *rowGroupRows, rows []Row, repetitionLevel byte) (int, error) { 205 // When the repetition depth is not zero, we know that we will be 206 // called with a single row as input. We attempt to read at most one 207 // value of a single row and return to the caller. 208 col := &r.columns[columnIndex] 209 buf := r.buffer(columnIndex) 210 211 if col.offset == col.length { 212 n, err := col.values.ReadValues(buf) 213 col.offset = 0 214 col.length = int32(n) 215 if n == 0 && err != nil { 216 return 0, err 217 } 218 } 219 220 if buf[col.offset].repetitionLevel != repetitionLevel { 221 return 0, nil 222 } 223 224 rows[0] = append(rows[0], buf[col.offset]) 225 col.offset++ 226 return 1, nil 227 } 228 } 229 230 return columnIndex + 1, read 231 }