github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/multi_row_group.go (about)

     1  package parquet
     2  
     3  import (
     4  	"io"
     5  )
     6  
     7  // MultiRowGroup wraps multiple row groups to appear as if it was a single
     8  // RowGroup. RowGroups must have the same schema or it will error.
     9  func MultiRowGroup(rowGroups ...RowGroup) RowGroup {
    10  	if len(rowGroups) == 0 {
    11  		return &emptyRowGroup{}
    12  	}
    13  	if len(rowGroups) == 1 {
    14  		return rowGroups[0]
    15  	}
    16  
    17  	schema, err := compatibleSchemaOf(rowGroups)
    18  	if err != nil {
    19  		panic(err)
    20  	}
    21  
    22  	rowGroupsCopy := make([]RowGroup, len(rowGroups))
    23  	copy(rowGroupsCopy, rowGroups)
    24  
    25  	c := new(multiRowGroup)
    26  	c.init(schema, rowGroupsCopy)
    27  	return c
    28  }
    29  
    30  func (c *multiRowGroup) init(schema *Schema, rowGroups []RowGroup) error {
    31  	columns := make([]multiColumnChunk, len(schema.Columns()))
    32  
    33  	rowGroupColumnChunks := make([][]ColumnChunk, len(rowGroups))
    34  	for i, rowGroup := range rowGroups {
    35  		rowGroupColumnChunks[i] = rowGroup.ColumnChunks()
    36  	}
    37  
    38  	for i := range columns {
    39  		columns[i].rowGroup = c
    40  		columns[i].column = i
    41  		columns[i].chunks = make([]ColumnChunk, len(rowGroupColumnChunks))
    42  
    43  		for j, columnChunks := range rowGroupColumnChunks {
    44  			columns[i].chunks[j] = columnChunks[i]
    45  		}
    46  	}
    47  
    48  	c.schema = schema
    49  	c.rowGroups = rowGroups
    50  	c.columns = make([]ColumnChunk, len(columns))
    51  
    52  	for i := range columns {
    53  		c.columns[i] = &columns[i]
    54  	}
    55  
    56  	return nil
    57  }
    58  
    59  func compatibleSchemaOf(rowGroups []RowGroup) (*Schema, error) {
    60  	schema := rowGroups[0].Schema()
    61  
    62  	// Fast path: Many times all row groups have the exact same schema so a
    63  	// pointer comparison is cheaper.
    64  	samePointer := true
    65  	for _, rowGroup := range rowGroups[1:] {
    66  		if rowGroup.Schema() != schema {
    67  			samePointer = false
    68  			break
    69  		}
    70  	}
    71  	if samePointer {
    72  		return schema, nil
    73  	}
    74  
    75  	// Slow path: The schema pointers are not the same, but they still have to
    76  	// be compatible.
    77  	for _, rowGroup := range rowGroups[1:] {
    78  		if !nodesAreEqual(schema, rowGroup.Schema()) {
    79  			return nil, ErrRowGroupSchemaMismatch
    80  		}
    81  	}
    82  
    83  	return schema, nil
    84  }
    85  
// multiRowGroup is the RowGroup returned by MultiRowGroup when it is given
// more than one row group. Its fields are set by init:
//
//   - schema is the value returned by Schema.
//   - rowGroups are the wrapped row groups; NumRows sums over them.
//   - columns is the value returned by ColumnChunks, one entry per schema
//     column.
type multiRowGroup struct {
	schema    *Schema
	rowGroups []RowGroup
	columns   []ColumnChunk
}
    91  
    92  func (c *multiRowGroup) NumRows() (numRows int64) {
    93  	for _, rowGroup := range c.rowGroups {
    94  		numRows += rowGroup.NumRows()
    95  	}
    96  	return numRows
    97  }
    98  
    99  func (c *multiRowGroup) ColumnChunks() []ColumnChunk { return c.columns }
   100  
   101  func (c *multiRowGroup) SortingColumns() []SortingColumn { return nil }
   102  
   103  func (c *multiRowGroup) Schema() *Schema { return c.schema }
   104  
   105  func (c *multiRowGroup) Rows() Rows { return &rowGroupRows{rowGroup: c} }
   106  
// multiColumnChunk is the ColumnChunk implementation stored in
// multiRowGroup.columns. It groups the chunks found at one column position
// across several row groups:
//
//   - rowGroup is the multiRowGroup that owns this column.
//   - column is the column position, as returned by Column.
//   - chunks holds one chunk per wrapped row group, in row group order.
type multiColumnChunk struct {
	rowGroup *multiRowGroup
	column   int
	chunks   []ColumnChunk
}
   112  
   113  func (c *multiColumnChunk) Type() Type {
   114  	if len(c.chunks) != 0 {
   115  		return c.chunks[0].Type() // all chunks should be of the same type
   116  	}
   117  	return nil
   118  }
   119  
   120  func (c *multiColumnChunk) NumValues() int64 {
   121  	n := int64(0)
   122  	for i := range c.chunks {
   123  		n += c.chunks[i].NumValues()
   124  	}
   125  	return n
   126  }
   127  
   128  func (c *multiColumnChunk) Column() int {
   129  	return c.column
   130  }
   131  
   132  func (c *multiColumnChunk) Pages() Pages {
   133  	return &multiPages{column: c}
   134  }
   135  
   136  func (c *multiColumnChunk) ColumnIndex() ColumnIndex {
   137  	// TODO: implement
   138  	return nil
   139  }
   140  
   141  func (c *multiColumnChunk) OffsetIndex() OffsetIndex {
   142  	// TODO: implement
   143  	return nil
   144  }
   145  
   146  func (c *multiColumnChunk) BloomFilter() BloomFilter {
   147  	return multiBloomFilter{c}
   148  }
   149  
// multiBloomFilter is the value returned by multiColumnChunk.BloomFilter. It
// embeds the column so that its methods can iterate over the column's chunks
// and use the bloom filter of each chunk that has one.
type multiBloomFilter struct{ *multiColumnChunk }
   151  
   152  func (f multiBloomFilter) ReadAt(b []byte, off int64) (int, error) {
   153  	// TODO: add a test for this function
   154  	i := 0
   155  
   156  	for i < len(f.chunks) {
   157  		if r := f.chunks[i].BloomFilter(); r != nil {
   158  			size := r.Size()
   159  			if off < size {
   160  				break
   161  			}
   162  			off -= size
   163  		}
   164  		i++
   165  	}
   166  
   167  	if i == len(f.chunks) {
   168  		return 0, io.EOF
   169  	}
   170  
   171  	rn := int(0)
   172  	for len(b) > 0 {
   173  		if r := f.chunks[i].BloomFilter(); r != nil {
   174  			n, err := r.ReadAt(b, off)
   175  			rn += n
   176  			if err != nil {
   177  				return rn, err
   178  			}
   179  			if b = b[n:]; len(b) == 0 {
   180  				return rn, nil
   181  			}
   182  			off += int64(n)
   183  		}
   184  		i++
   185  	}
   186  
   187  	if i == len(f.chunks) {
   188  		return rn, io.EOF
   189  	}
   190  	return rn, nil
   191  }
   192  
   193  func (f multiBloomFilter) Size() int64 {
   194  	size := int64(0)
   195  	for _, c := range f.chunks {
   196  		if b := c.BloomFilter(); b != nil {
   197  			size += b.Size()
   198  		}
   199  	}
   200  	return size
   201  }
   202  
   203  func (f multiBloomFilter) Check(v Value) (bool, error) {
   204  	for _, c := range f.chunks {
   205  		if b := c.BloomFilter(); b != nil {
   206  			if ok, err := b.Check(v); ok || err != nil {
   207  				return ok, err
   208  			}
   209  		}
   210  	}
   211  	return false, nil
   212  }
   213  
// multiPages is the Pages implementation returned by multiColumnChunk.Pages.
// It reads the pages of the column's chunks one chunk after another:
//
//   - pages is the reader of the chunk currently being read, or nil when no
//     chunk is open.
//   - index is the position in column.chunks of the next chunk to open.
//   - column is the column being read; Close sets it to nil.
type multiPages struct {
	pages  Pages
	index  int
	column *multiColumnChunk
}
   219  
   220  func (m *multiPages) ReadPage() (Page, error) {
   221  	for {
   222  		if m.pages != nil {
   223  			p, err := m.pages.ReadPage()
   224  			if err == nil || err != io.EOF {
   225  				return p, err
   226  			}
   227  			if err := m.pages.Close(); err != nil {
   228  				return nil, err
   229  			}
   230  			m.pages = nil
   231  		}
   232  
   233  		if m.column == nil || m.index == len(m.column.chunks) {
   234  			return nil, io.EOF
   235  		}
   236  
   237  		m.pages = m.column.chunks[m.index].Pages()
   238  		m.index++
   239  	}
   240  }
   241  
   242  func (m *multiPages) SeekToRow(rowIndex int64) error {
   243  	if m.column == nil {
   244  		return io.ErrClosedPipe
   245  	}
   246  
   247  	if m.pages != nil {
   248  		if err := m.pages.Close(); err != nil {
   249  			return err
   250  		}
   251  	}
   252  
   253  	rowGroups := m.column.rowGroup.rowGroups
   254  	numRows := int64(0)
   255  	m.pages = nil
   256  	m.index = 0
   257  
   258  	for m.index < len(rowGroups) {
   259  		numRows = rowGroups[m.index].NumRows()
   260  		if rowIndex < numRows {
   261  			break
   262  		}
   263  		rowIndex -= numRows
   264  		m.index++
   265  	}
   266  
   267  	if m.index < len(rowGroups) {
   268  		m.pages = m.column.chunks[m.index].Pages()
   269  		m.index++
   270  		return m.pages.SeekToRow(rowIndex)
   271  	}
   272  	return nil
   273  }
   274  
   275  func (m *multiPages) Close() (err error) {
   276  	if m.pages != nil {
   277  		err = m.pages.Close()
   278  	}
   279  	m.pages = nil
   280  	m.index = 0
   281  	m.column = nil
   282  	return err
   283  }