github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/multi_row_group.go (about) 1 package parquet 2 3 import ( 4 "io" 5 ) 6 7 // MultiRowGroup wraps multiple row groups to appear as if it was a single 8 // RowGroup. RowGroups must have the same schema or it will error. 9 func MultiRowGroup(rowGroups ...RowGroup) RowGroup { 10 if len(rowGroups) == 0 { 11 return &emptyRowGroup{} 12 } 13 if len(rowGroups) == 1 { 14 return rowGroups[0] 15 } 16 17 schema, err := compatibleSchemaOf(rowGroups) 18 if err != nil { 19 panic(err) 20 } 21 22 rowGroupsCopy := make([]RowGroup, len(rowGroups)) 23 copy(rowGroupsCopy, rowGroups) 24 25 c := new(multiRowGroup) 26 c.init(schema, rowGroupsCopy) 27 return c 28 } 29 30 func (c *multiRowGroup) init(schema *Schema, rowGroups []RowGroup) error { 31 columns := make([]multiColumnChunk, len(schema.Columns())) 32 33 rowGroupColumnChunks := make([][]ColumnChunk, len(rowGroups)) 34 for i, rowGroup := range rowGroups { 35 rowGroupColumnChunks[i] = rowGroup.ColumnChunks() 36 } 37 38 for i := range columns { 39 columns[i].rowGroup = c 40 columns[i].column = i 41 columns[i].chunks = make([]ColumnChunk, len(rowGroupColumnChunks)) 42 43 for j, columnChunks := range rowGroupColumnChunks { 44 columns[i].chunks[j] = columnChunks[i] 45 } 46 } 47 48 c.schema = schema 49 c.rowGroups = rowGroups 50 c.columns = make([]ColumnChunk, len(columns)) 51 52 for i := range columns { 53 c.columns[i] = &columns[i] 54 } 55 56 return nil 57 } 58 59 func compatibleSchemaOf(rowGroups []RowGroup) (*Schema, error) { 60 schema := rowGroups[0].Schema() 61 62 // Fast path: Many times all row groups have the exact same schema so a 63 // pointer comparison is cheaper. 64 samePointer := true 65 for _, rowGroup := range rowGroups[1:] { 66 if rowGroup.Schema() != schema { 67 samePointer = false 68 break 69 } 70 } 71 if samePointer { 72 return schema, nil 73 } 74 75 // Slow path: The schema pointers are not the same, but they still have to 76 // be compatible. 77 for _, rowGroup := range rowGroups[1:] { 78 if !nodesAreEqual(schema, rowGroup.Schema()) { 79 return nil, ErrRowGroupSchemaMismatch 80 } 81 } 82 83 return schema, nil 84 } 85 86 type multiRowGroup struct { 87 schema *Schema 88 rowGroups []RowGroup 89 columns []ColumnChunk 90 } 91 92 func (c *multiRowGroup) NumRows() (numRows int64) { 93 for _, rowGroup := range c.rowGroups { 94 numRows += rowGroup.NumRows() 95 } 96 return numRows 97 } 98 99 func (c *multiRowGroup) ColumnChunks() []ColumnChunk { return c.columns } 100 101 func (c *multiRowGroup) SortingColumns() []SortingColumn { return nil } 102 103 func (c *multiRowGroup) Schema() *Schema { return c.schema } 104 105 func (c *multiRowGroup) Rows() Rows { return &rowGroupRows{rowGroup: c} } 106 107 type multiColumnChunk struct { 108 rowGroup *multiRowGroup 109 column int 110 chunks []ColumnChunk 111 } 112 113 func (c *multiColumnChunk) Type() Type { 114 if len(c.chunks) != 0 { 115 return c.chunks[0].Type() // all chunks should be of the same type 116 } 117 return nil 118 } 119 120 func (c *multiColumnChunk) NumValues() int64 { 121 n := int64(0) 122 for i := range c.chunks { 123 n += c.chunks[i].NumValues() 124 } 125 return n 126 } 127 128 func (c *multiColumnChunk) Column() int { 129 return c.column 130 } 131 132 func (c *multiColumnChunk) Pages() Pages { 133 return &multiPages{column: c} 134 } 135 136 func (c *multiColumnChunk) ColumnIndex() ColumnIndex { 137 // TODO: implement 138 return nil 139 } 140 141 func (c *multiColumnChunk) OffsetIndex() OffsetIndex { 142 // TODO: implement 143 return nil 144 } 145 146 func (c *multiColumnChunk) BloomFilter() BloomFilter { 147 return multiBloomFilter{c} 148 } 149 150 type multiBloomFilter struct{ *multiColumnChunk } 151 152 func (f multiBloomFilter) ReadAt(b []byte, off int64) (int, error) { 153 // TODO: add a test for this function 154 i := 0 155 156 for i < len(f.chunks) { 157 if r := f.chunks[i].BloomFilter(); r != nil { 158 size := r.Size() 159 if off < size { 160 break 161 } 162 off -= size 163 } 164 i++ 165 } 166 167 if i == len(f.chunks) { 168 return 0, io.EOF 169 } 170 171 rn := int(0) 172 for len(b) > 0 { 173 if r := f.chunks[i].BloomFilter(); r != nil { 174 n, err := r.ReadAt(b, off) 175 rn += n 176 if err != nil { 177 return rn, err 178 } 179 if b = b[n:]; len(b) == 0 { 180 return rn, nil 181 } 182 off += int64(n) 183 } 184 i++ 185 } 186 187 if i == len(f.chunks) { 188 return rn, io.EOF 189 } 190 return rn, nil 191 } 192 193 func (f multiBloomFilter) Size() int64 { 194 size := int64(0) 195 for _, c := range f.chunks { 196 if b := c.BloomFilter(); b != nil { 197 size += b.Size() 198 } 199 } 200 return size 201 } 202 203 func (f multiBloomFilter) Check(v Value) (bool, error) { 204 for _, c := range f.chunks { 205 if b := c.BloomFilter(); b != nil { 206 if ok, err := b.Check(v); ok || err != nil { 207 return ok, err 208 } 209 } 210 } 211 return false, nil 212 } 213 214 type multiPages struct { 215 pages Pages 216 index int 217 column *multiColumnChunk 218 } 219 220 func (m *multiPages) ReadPage() (Page, error) { 221 for { 222 if m.pages != nil { 223 p, err := m.pages.ReadPage() 224 if err == nil || err != io.EOF { 225 return p, err 226 } 227 if err := m.pages.Close(); err != nil { 228 return nil, err 229 } 230 m.pages = nil 231 } 232 233 if m.column == nil || m.index == len(m.column.chunks) { 234 return nil, io.EOF 235 } 236 237 m.pages = m.column.chunks[m.index].Pages() 238 m.index++ 239 } 240 } 241 242 func (m *multiPages) SeekToRow(rowIndex int64) error { 243 if m.column == nil { 244 return io.ErrClosedPipe 245 } 246 247 if m.pages != nil { 248 if err := m.pages.Close(); err != nil { 249 return err 250 } 251 } 252 253 rowGroups := m.column.rowGroup.rowGroups 254 numRows := int64(0) 255 m.pages = nil 256 m.index = 0 257 258 for m.index < len(rowGroups) { 259 numRows = rowGroups[m.index].NumRows() 260 if rowIndex < numRows { 261 break 262 } 263 rowIndex -= numRows 264 m.index++ 265 } 266 267 if m.index < len(rowGroups) { 268 m.pages = m.column.chunks[m.index].Pages() 269 m.index++ 270 return m.pages.SeekToRow(rowIndex) 271 } 272 return nil 273 } 274 275 func (m *multiPages) Close() (err error) { 276 if m.pages != nil { 277 err = m.pages.Close() 278 } 279 m.pages = nil 280 m.index = 0 281 m.column = nil 282 return err 283 }