github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/multi_row_group.go (about)

     1  package parquet
     2  
     3  import (
     4  	"io"
     5  )
     6  
// MultiRowGroup wraps multiple row groups to appear as if they were a single
// RowGroup, using ReadModeSync as the page read mode.
//
// With no arguments it returns an empty row group, and with exactly one
// argument it returns that row group unchanged. When two or more row groups
// are given they must have compatible schemas; otherwise MultiRowGroup panics
// with the error produced by the schema comparison.
func MultiRowGroup(rowGroups ...RowGroup) RowGroup {
	return newMultiRowGroup(ReadModeSync, rowGroups...)
}
    12  
    13  func newMultiRowGroup(pageReadMode ReadMode, rowGroups ...RowGroup) RowGroup {
    14  	if len(rowGroups) == 0 {
    15  		return &emptyRowGroup{}
    16  	}
    17  	if len(rowGroups) == 1 {
    18  		return rowGroups[0]
    19  	}
    20  
    21  	schema, err := compatibleSchemaOf(rowGroups)
    22  	if err != nil {
    23  		panic(err)
    24  	}
    25  
    26  	rowGroupsCopy := make([]RowGroup, len(rowGroups))
    27  	copy(rowGroupsCopy, rowGroups)
    28  
    29  	c := &multiRowGroup{
    30  		pageReadMode: pageReadMode,
    31  	}
    32  	c.init(schema, rowGroupsCopy)
    33  	return c
    34  }
    35  
    36  func (c *multiRowGroup) init(schema *Schema, rowGroups []RowGroup) error {
    37  	columns := make([]multiColumnChunk, len(schema.Columns()))
    38  
    39  	rowGroupColumnChunks := make([][]ColumnChunk, len(rowGroups))
    40  	for i, rowGroup := range rowGroups {
    41  		rowGroupColumnChunks[i] = rowGroup.ColumnChunks()
    42  	}
    43  
    44  	for i := range columns {
    45  		columns[i].rowGroup = c
    46  		columns[i].column = i
    47  		columns[i].chunks = make([]ColumnChunk, len(rowGroupColumnChunks))
    48  
    49  		for j, columnChunks := range rowGroupColumnChunks {
    50  			columns[i].chunks[j] = columnChunks[i]
    51  		}
    52  	}
    53  
    54  	c.schema = schema
    55  	c.rowGroups = rowGroups
    56  	c.columns = make([]ColumnChunk, len(columns))
    57  
    58  	for i := range columns {
    59  		c.columns[i] = &columns[i]
    60  	}
    61  
    62  	return nil
    63  }
    64  
    65  func compatibleSchemaOf(rowGroups []RowGroup) (*Schema, error) {
    66  	schema := rowGroups[0].Schema()
    67  
    68  	// Fast path: Many times all row groups have the exact same schema so a
    69  	// pointer comparison is cheaper.
    70  	samePointer := true
    71  	for _, rowGroup := range rowGroups[1:] {
    72  		if rowGroup.Schema() != schema {
    73  			samePointer = false
    74  			break
    75  		}
    76  	}
    77  	if samePointer {
    78  		return schema, nil
    79  	}
    80  
    81  	// Slow path: The schema pointers are not the same, but they still have to
    82  	// be compatible.
    83  	for _, rowGroup := range rowGroups[1:] {
    84  		if !nodesAreEqual(schema, rowGroup.Schema()) {
    85  			return nil, ErrRowGroupSchemaMismatch
    86  		}
    87  	}
    88  
    89  	return schema, nil
    90  }
    91  
// multiRowGroup is the RowGroup built by newMultiRowGroup when two or more row
// groups are combined. It presents them as one logical row group.
type multiRowGroup struct {
	schema       *Schema       // schema common to all the row groups
	rowGroups    []RowGroup    // underlying row groups, in the order given
	columns      []ColumnChunk // one *multiColumnChunk per schema column
	pageReadMode ReadMode      // read mode handed to newRowGroupRows by Rows
}
    98  
    99  func (c *multiRowGroup) NumRows() (numRows int64) {
   100  	for _, rowGroup := range c.rowGroups {
   101  		numRows += rowGroup.NumRows()
   102  	}
   103  	return numRows
   104  }
   105  
// ColumnChunks returns the merged column chunks assembled by init, one per
// column of the schema.
func (c *multiRowGroup) ColumnChunks() []ColumnChunk { return c.columns }
   107  
// SortingColumns always returns nil: the combined row group reports no sorting
// columns and does not consult the underlying row groups.
func (c *multiRowGroup) SortingColumns() []SortingColumn { return nil }
   109  
// Schema returns the schema the combined row group was initialized with.
func (c *multiRowGroup) Schema() *Schema { return c.schema }
   111  
// Rows returns a reader over the rows of the combined row group, created by
// newRowGroupRows with the page read mode given at construction.
func (c *multiRowGroup) Rows() Rows { return newRowGroupRows(c, c.pageReadMode) }
   113  
// multiColumnChunk is the ColumnChunk for one column of a multiRowGroup. It
// groups the chunks that the underlying row groups hold for that column.
type multiColumnChunk struct {
	rowGroup *multiRowGroup // combined row group this column belongs to
	column   int            // index of the column in the schema
	chunks   []ColumnChunk  // this column's chunk from each row group, in order
}
   119  
   120  func (c *multiColumnChunk) Type() Type {
   121  	if len(c.chunks) != 0 {
   122  		return c.chunks[0].Type() // all chunks should be of the same type
   123  	}
   124  	return nil
   125  }
   126  
   127  func (c *multiColumnChunk) NumValues() int64 {
   128  	n := int64(0)
   129  	for i := range c.chunks {
   130  		n += c.chunks[i].NumValues()
   131  	}
   132  	return n
   133  }
   134  
// Column returns the index of the column within the schema, as assigned when
// the enclosing multiRowGroup was initialized.
func (c *multiColumnChunk) Column() int {
	return c.column
}
   138  
// Pages returns a new reader positioned before the first chunk of the column.
// No underlying chunk is opened here; multiPages opens them one at a time as
// pages are read or a row is sought.
func (c *multiColumnChunk) Pages() Pages {
	return &multiPages{column: c}
}
   142  
// ColumnIndex is not implemented for merged columns and always returns nil.
func (c *multiColumnChunk) ColumnIndex() ColumnIndex {
	// TODO: implement
	return nil
}
   147  
// OffsetIndex is not implemented for merged columns and always returns nil.
func (c *multiColumnChunk) OffsetIndex() OffsetIndex {
	// TODO: implement
	return nil
}
   152  
// BloomFilter returns a filter that combines the bloom filters of the
// underlying chunks. The result is never nil, even when none of the chunks
// has a bloom filter of its own.
func (c *multiColumnChunk) BloomFilter() BloomFilter {
	return multiBloomFilter{c}
}
   156  
// multiBloomFilter is the BloomFilter returned by multiColumnChunk.BloomFilter.
// It embeds the column so that its methods can reach the bloom filters of the
// underlying chunks.
type multiBloomFilter struct{ *multiColumnChunk }
   158  
   159  func (f multiBloomFilter) ReadAt(b []byte, off int64) (int, error) {
   160  	// TODO: add a test for this function
   161  	i := 0
   162  
   163  	for i < len(f.chunks) {
   164  		if r := f.chunks[i].BloomFilter(); r != nil {
   165  			size := r.Size()
   166  			if off < size {
   167  				break
   168  			}
   169  			off -= size
   170  		}
   171  		i++
   172  	}
   173  
   174  	if i == len(f.chunks) {
   175  		return 0, io.EOF
   176  	}
   177  
   178  	rn := int(0)
   179  	for len(b) > 0 {
   180  		if r := f.chunks[i].BloomFilter(); r != nil {
   181  			n, err := r.ReadAt(b, off)
   182  			rn += n
   183  			if err != nil {
   184  				return rn, err
   185  			}
   186  			if b = b[n:]; len(b) == 0 {
   187  				return rn, nil
   188  			}
   189  			off += int64(n)
   190  		}
   191  		i++
   192  	}
   193  
   194  	if i == len(f.chunks) {
   195  		return rn, io.EOF
   196  	}
   197  	return rn, nil
   198  }
   199  
   200  func (f multiBloomFilter) Size() int64 {
   201  	size := int64(0)
   202  	for _, c := range f.chunks {
   203  		if b := c.BloomFilter(); b != nil {
   204  			size += b.Size()
   205  		}
   206  	}
   207  	return size
   208  }
   209  
   210  func (f multiBloomFilter) Check(v Value) (bool, error) {
   211  	for _, c := range f.chunks {
   212  		if b := c.BloomFilter(); b != nil {
   213  			if ok, err := b.Check(v); ok || err != nil {
   214  				return ok, err
   215  			}
   216  		}
   217  	}
   218  	return false, nil
   219  }
   220  
// multiPages is the Pages implementation returned by multiColumnChunk.Pages.
// It reads the pages of each underlying chunk in turn.
type multiPages struct {
	pages  Pages             // reader of the chunk currently open; nil if none
	index  int               // index of the next chunk to open in column.chunks
	column *multiColumnChunk // column being read; nil once Close has been called
}
   226  
   227  func (m *multiPages) ReadPage() (Page, error) {
   228  	for {
   229  		if m.pages != nil {
   230  			p, err := m.pages.ReadPage()
   231  			if err == nil || err != io.EOF {
   232  				return p, err
   233  			}
   234  			if err := m.pages.Close(); err != nil {
   235  				return nil, err
   236  			}
   237  			m.pages = nil
   238  		}
   239  
   240  		if m.column == nil || m.index == len(m.column.chunks) {
   241  			return nil, io.EOF
   242  		}
   243  
   244  		m.pages = m.column.chunks[m.index].Pages()
   245  		m.index++
   246  	}
   247  }
   248  
   249  func (m *multiPages) SeekToRow(rowIndex int64) error {
   250  	if m.column == nil {
   251  		return io.ErrClosedPipe
   252  	}
   253  
   254  	if m.pages != nil {
   255  		if err := m.pages.Close(); err != nil {
   256  			return err
   257  		}
   258  	}
   259  
   260  	rowGroups := m.column.rowGroup.rowGroups
   261  	numRows := int64(0)
   262  	m.pages = nil
   263  	m.index = 0
   264  
   265  	for m.index < len(rowGroups) {
   266  		numRows = rowGroups[m.index].NumRows()
   267  		if rowIndex < numRows {
   268  			break
   269  		}
   270  		rowIndex -= numRows
   271  		m.index++
   272  	}
   273  
   274  	if m.index < len(rowGroups) {
   275  		m.pages = m.column.chunks[m.index].Pages()
   276  		m.index++
   277  		return m.pages.SeekToRow(rowIndex)
   278  	}
   279  	return nil
   280  }
   281  
   282  func (m *multiPages) Close() (err error) {
   283  	if m.pages != nil {
   284  		err = m.pages.Close()
   285  	}
   286  	m.pages = nil
   287  	m.index = 0
   288  	m.column = nil
   289  	return err
   290  }