github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/column_chunk.go (about) 1 package parquet 2 3 import ( 4 "io" 5 ) 6 7 // The ColumnChunk interface represents individual columns of a row group. 8 type ColumnChunk interface { 9 // Returns the column type. 10 Type() Type 11 12 // Returns the index of this column in its parent row group. 13 Column() int 14 15 // Returns a reader exposing the pages of the column. 16 Pages() Pages 17 18 // Returns the components of the page index for this column chunk, 19 // containing details about the content and location of pages within the 20 // chunk. 21 // 22 // Note that the returned value may be the same across calls to these 23 // methods, programs must treat those as read-only. 24 // 25 // If the column chunk does not have a page index, the methods return nil. 26 ColumnIndex() ColumnIndex 27 OffsetIndex() OffsetIndex 28 BloomFilter() BloomFilter 29 30 // Returns the number of values in the column chunk. 31 // 32 // This quantity may differ from the number of rows in the parent row group 33 // because repeated columns may hold zero or more values per row. 34 NumValues() int64 35 } 36 37 type pageAndValueWriter interface { 38 PageWriter 39 ValueWriter 40 } 41 42 type readRowsFunc func(*rowGroupRows, []Row, byte) (int, error) 43 44 func readRowsFuncOf(node Node, columnIndex int, repetitionDepth byte) (int, readRowsFunc) { 45 var read readRowsFunc 46 47 if node.Repeated() { 48 repetitionDepth++ 49 } 50 51 if node.Leaf() { 52 columnIndex, read = readRowsFuncOfLeaf(columnIndex, repetitionDepth) 53 } else { 54 columnIndex, read = readRowsFuncOfGroup(node, columnIndex, repetitionDepth) 55 } 56 57 if node.Repeated() { 58 read = readRowsFuncOfRepeated(read, repetitionDepth) 59 } 60 61 return columnIndex, read 62 } 63 64 //go:noinline 65 func readRowsFuncOfRepeated(read readRowsFunc, repetitionDepth byte) readRowsFunc { 66 return func(r *rowGroupRows, rows []Row, repetitionLevel byte) (int, error) { 67 for i := range rows { 68 // Repeated columns have variable number of values, we must process 69 // them one row at a time because we cannot predict how many values 70 // need to be consumed in each iteration. 71 row := rows[i : i+1] 72 73 // The first pass looks for values marking the beginning of a row by 74 // having a repetition level equal to the current level. 75 n, err := read(r, row, repetitionLevel) 76 if err != nil { 77 // The error here may likely be io.EOF, the read function may 78 // also have successfully read a row, which is indicated by a 79 // non-zero count. In this case, we increment the index to 80 // indicate to the caller than rows up to i+1 have been read. 81 if n > 0 { 82 i++ 83 } 84 return i, err 85 } 86 87 // The read function may return no errors and also read no rows in 88 // case where it had more values to read but none corresponded to 89 // the current repetition level. This is an indication that we will 90 // not be able to read more rows at this stage, we must return to 91 // the caller to let it set the repetition level to its current 92 // depth, which may allow us to read more values when called again. 93 if n == 0 { 94 return i, nil 95 } 96 97 // When we reach this stage, we have successfully read the first 98 // values of a row of repeated columns. We continue consuming more 99 // repeated values until we get the indication that we consumed 100 // them all (the read function returns zero and no errors). 101 for { 102 n, err := read(r, row, repetitionDepth) 103 if err != nil { 104 return i + 1, err 105 } 106 if n == 0 { 107 break 108 } 109 } 110 } 111 return len(rows), nil 112 } 113 } 114 115 //go:noinline 116 func readRowsFuncOfGroup(node Node, columnIndex int, repetitionDepth byte) (int, readRowsFunc) { 117 fields := node.Fields() 118 119 if len(fields) == 0 { 120 return columnIndex, func(*rowGroupRows, []Row, byte) (int, error) { 121 return 0, io.EOF 122 } 123 } 124 125 if len(fields) == 1 { 126 // Small optimization for a somewhat common case of groups with a single 127 // column (like nested list elements for example); there is no need to 128 // loop over the group of a single element, we can simply skip to calling 129 // the inner read function. 130 return readRowsFuncOf(fields[0], columnIndex, repetitionDepth) 131 } 132 133 group := make([]readRowsFunc, len(fields)) 134 for i := range group { 135 columnIndex, group[i] = readRowsFuncOf(fields[i], columnIndex, repetitionDepth) 136 } 137 138 return columnIndex, func(r *rowGroupRows, rows []Row, repetitionLevel byte) (int, error) { 139 // When reading a group, we use the first column as an indicator of how 140 // may rows can be read during this call. 141 n, err := group[0](r, rows, repetitionLevel) 142 143 if n > 0 { 144 // Read values for all rows that the group is able to consume. 145 // Getting io.EOF from calling the read functions indicate that 146 // we consumed all values of that particular column, but there may 147 // be more to read in other columns, therefore we must always read 148 // all columns and cannot stop on the first error. 149 for _, read := range group[1:] { 150 _, err2 := read(r, rows[:n], repetitionLevel) 151 if err2 != nil && err2 != io.EOF { 152 return 0, err2 153 } 154 } 155 } 156 157 return n, err 158 } 159 } 160 161 //go:noinline 162 func readRowsFuncOfLeaf(columnIndex int, repetitionDepth byte) (int, readRowsFunc) { 163 var read readRowsFunc 164 165 if repetitionDepth == 0 { 166 read = func(r *rowGroupRows, rows []Row, _ byte) (int, error) { 167 // When the repetition depth is zero, we know that there is exactly 168 // one value per row for this column, and therefore we can consume 169 // as many values as there are rows to fill. 170 col := &r.columns[columnIndex] 171 buf := r.buffer(columnIndex) 172 173 for i := range rows { 174 if col.offset == col.length { 175 n, err := col.values.ReadValues(buf) 176 col.offset = 0 177 col.length = int32(n) 178 if n == 0 && err != nil { 179 return 0, err 180 } 181 } 182 183 rows[i] = append(rows[i], buf[col.offset]) 184 col.offset++ 185 } 186 187 return len(rows), nil 188 } 189 } else { 190 read = func(r *rowGroupRows, rows []Row, repetitionLevel byte) (int, error) { 191 // When the repetition depth is not zero, we know that we will be 192 // called with a single row as input. We attempt to read at most one 193 // value of a single row and return to the caller. 194 col := &r.columns[columnIndex] 195 buf := r.buffer(columnIndex) 196 197 if col.offset == col.length { 198 n, err := col.values.ReadValues(buf) 199 col.offset = 0 200 col.length = int32(n) 201 if n == 0 && err != nil { 202 return 0, err 203 } 204 } 205 206 if buf[col.offset].repetitionLevel != repetitionLevel { 207 return 0, nil 208 } 209 210 rows[0] = append(rows[0], buf[col.offset]) 211 col.offset++ 212 return 1, nil 213 } 214 } 215 216 return columnIndex + 1, read 217 }