github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/multi_row_group.go (about) 1 package parquet 2 3 import ( 4 "io" 5 ) 6 7 // MultiRowGroup wraps multiple row groups to appear as if it was a single 8 // RowGroup. RowGroups must have the same schema or it will error. 9 func MultiRowGroup(rowGroups ...RowGroup) RowGroup { 10 return newMultiRowGroup(ReadModeSync, rowGroups...) 11 } 12 13 func newMultiRowGroup(pageReadMode ReadMode, rowGroups ...RowGroup) RowGroup { 14 if len(rowGroups) == 0 { 15 return &emptyRowGroup{} 16 } 17 if len(rowGroups) == 1 { 18 return rowGroups[0] 19 } 20 21 schema, err := compatibleSchemaOf(rowGroups) 22 if err != nil { 23 panic(err) 24 } 25 26 rowGroupsCopy := make([]RowGroup, len(rowGroups)) 27 copy(rowGroupsCopy, rowGroups) 28 29 c := &multiRowGroup{ 30 pageReadMode: pageReadMode, 31 } 32 c.init(schema, rowGroupsCopy) 33 return c 34 } 35 36 func (c *multiRowGroup) init(schema *Schema, rowGroups []RowGroup) error { 37 columns := make([]multiColumnChunk, len(schema.Columns())) 38 39 rowGroupColumnChunks := make([][]ColumnChunk, len(rowGroups)) 40 for i, rowGroup := range rowGroups { 41 rowGroupColumnChunks[i] = rowGroup.ColumnChunks() 42 } 43 44 for i := range columns { 45 columns[i].rowGroup = c 46 columns[i].column = i 47 columns[i].chunks = make([]ColumnChunk, len(rowGroupColumnChunks)) 48 49 for j, columnChunks := range rowGroupColumnChunks { 50 columns[i].chunks[j] = columnChunks[i] 51 } 52 } 53 54 c.schema = schema 55 c.rowGroups = rowGroups 56 c.columns = make([]ColumnChunk, len(columns)) 57 58 for i := range columns { 59 c.columns[i] = &columns[i] 60 } 61 62 return nil 63 } 64 65 func compatibleSchemaOf(rowGroups []RowGroup) (*Schema, error) { 66 schema := rowGroups[0].Schema() 67 68 // Fast path: Many times all row groups have the exact same schema so a 69 // pointer comparison is cheaper. 70 samePointer := true 71 for _, rowGroup := range rowGroups[1:] { 72 if rowGroup.Schema() != schema { 73 samePointer = false 74 break 75 } 76 } 77 if samePointer { 78 return schema, nil 79 } 80 81 // Slow path: The schema pointers are not the same, but they still have to 82 // be compatible. 83 for _, rowGroup := range rowGroups[1:] { 84 if !nodesAreEqual(schema, rowGroup.Schema()) { 85 return nil, ErrRowGroupSchemaMismatch 86 } 87 } 88 89 return schema, nil 90 } 91 92 type multiRowGroup struct { 93 schema *Schema 94 rowGroups []RowGroup 95 columns []ColumnChunk 96 pageReadMode ReadMode 97 } 98 99 func (c *multiRowGroup) NumRows() (numRows int64) { 100 for _, rowGroup := range c.rowGroups { 101 numRows += rowGroup.NumRows() 102 } 103 return numRows 104 } 105 106 func (c *multiRowGroup) ColumnChunks() []ColumnChunk { return c.columns } 107 108 func (c *multiRowGroup) SortingColumns() []SortingColumn { return nil } 109 110 func (c *multiRowGroup) Schema() *Schema { return c.schema } 111 112 func (c *multiRowGroup) Rows() Rows { return newRowGroupRows(c, c.pageReadMode) } 113 114 type multiColumnChunk struct { 115 rowGroup *multiRowGroup 116 column int 117 chunks []ColumnChunk 118 } 119 120 func (c *multiColumnChunk) Type() Type { 121 if len(c.chunks) != 0 { 122 return c.chunks[0].Type() // all chunks should be of the same type 123 } 124 return nil 125 } 126 127 func (c *multiColumnChunk) NumValues() int64 { 128 n := int64(0) 129 for i := range c.chunks { 130 n += c.chunks[i].NumValues() 131 } 132 return n 133 } 134 135 func (c *multiColumnChunk) Column() int { 136 return c.column 137 } 138 139 func (c *multiColumnChunk) Pages() Pages { 140 return &multiPages{column: c} 141 } 142 143 func (c *multiColumnChunk) ColumnIndex() (ColumnIndex, error) { 144 // TODO: implement 145 return nil, nil 146 } 147 148 func (c *multiColumnChunk) OffsetIndex() (OffsetIndex, error) { 149 // TODO: implement 150 return nil, nil 151 } 152 153 func (c *multiColumnChunk) BloomFilter() BloomFilter { 154 return multiBloomFilter{c} 155 } 156 157 type multiBloomFilter struct{ *multiColumnChunk } 158 159 func (f multiBloomFilter) ReadAt(b []byte, off int64) (int, error) { 160 // TODO: add a test for this function 161 i := 0 162 163 for i < len(f.chunks) { 164 if r := f.chunks[i].BloomFilter(); r != nil { 165 size := r.Size() 166 if off < size { 167 break 168 } 169 off -= size 170 } 171 i++ 172 } 173 174 if i == len(f.chunks) { 175 return 0, io.EOF 176 } 177 178 rn := int(0) 179 for len(b) > 0 { 180 if r := f.chunks[i].BloomFilter(); r != nil { 181 n, err := r.ReadAt(b, off) 182 rn += n 183 if err != nil { 184 return rn, err 185 } 186 if b = b[n:]; len(b) == 0 { 187 return rn, nil 188 } 189 off += int64(n) 190 } 191 i++ 192 } 193 194 if i == len(f.chunks) { 195 return rn, io.EOF 196 } 197 return rn, nil 198 } 199 200 func (f multiBloomFilter) Size() int64 { 201 size := int64(0) 202 for _, c := range f.chunks { 203 if b := c.BloomFilter(); b != nil { 204 size += b.Size() 205 } 206 } 207 return size 208 } 209 210 func (f multiBloomFilter) Check(v Value) (bool, error) { 211 for _, c := range f.chunks { 212 if b := c.BloomFilter(); b != nil { 213 if ok, err := b.Check(v); ok || err != nil { 214 return ok, err 215 } 216 } 217 } 218 return false, nil 219 } 220 221 type multiPages struct { 222 pages Pages 223 index int 224 column *multiColumnChunk 225 } 226 227 func (m *multiPages) ReadPage() (Page, error) { 228 for { 229 if m.pages != nil { 230 p, err := m.pages.ReadPage() 231 if err == nil || err != io.EOF { 232 return p, err 233 } 234 if err := m.pages.Close(); err != nil { 235 return nil, err 236 } 237 m.pages = nil 238 } 239 240 if m.column == nil || m.index == len(m.column.chunks) { 241 return nil, io.EOF 242 } 243 244 m.pages = m.column.chunks[m.index].Pages() 245 m.index++ 246 } 247 } 248 249 func (m *multiPages) SeekToRow(rowIndex int64) error { 250 if m.column == nil { 251 return io.ErrClosedPipe 252 } 253 254 if m.pages != nil { 255 if err := m.pages.Close(); err != nil { 256 return err 257 } 258 } 259 260 rowGroups := m.column.rowGroup.rowGroups 261 numRows := int64(0) 262 m.pages = nil 263 m.index = 0 264 265 for m.index < len(rowGroups) { 266 numRows = rowGroups[m.index].NumRows() 267 if rowIndex < numRows { 268 break 269 } 270 rowIndex -= numRows 271 m.index++ 272 } 273 274 if m.index < len(rowGroups) { 275 m.pages = m.column.chunks[m.index].Pages() 276 m.index++ 277 return m.pages.SeekToRow(rowIndex) 278 } 279 return nil 280 } 281 282 func (m *multiPages) Close() (err error) { 283 if m.pages != nil { 284 err = m.pages.Close() 285 } 286 m.pages = nil 287 m.index = 0 288 m.column = nil 289 return err 290 }