package parquet

import (
	"io"
	"sort"
)

// SortingWriter is a type similar to GenericWriter but it ensures that rows
// are sorted according to the sorting columns configured on the writer.
//
// The writer accumulates rows in an in-memory buffer which is sorted when it
// reaches the target number of rows, then written to a temporary row group.
// When the writer is flushed or closed, the temporary row groups are merged
// into a row group in the output file, ensuring that rows remain sorted in the
// final row group.
//
// Because row groups get encoded and compressed, they hold a lot less memory
// than if all rows were retained in memory. Sorting then merging rows chunks
// also tends to be a lot more efficient than sorting all rows in memory as it
// results in better CPU cache utilization since sorting multi-megabyte arrays
// causes a lot of cache misses since the data set cannot be held in CPU caches.
type SortingWriter[T any] struct {
	// In-memory buffer accumulating up to maxRows rows before they are
	// sorted and spilled to the temporary buffer.
	rowbuf *RowBuffer[T]
	// Writer targeting the temporary spill buffer; starts on io.Discard and
	// is pointed at `buffer` lazily, on the first spill.
	writer *GenericWriter[T]
	// Writer producing the final, merged output passed to NewSortingWriter.
	output *GenericWriter[T]
	// Temporary spill storage obtained from sorting.SortingBuffers; nil until
	// the first sorted chunk is written, released by resetSortingBuffer.
	buffer io.ReadWriteSeeker
	// Target number of rows to buffer in memory before spilling a sorted
	// chunk (the sortRowCount constructor argument).
	maxRows int64
	// Total number of rows spilled to `buffer` since the last flush.
	numRows int64
	// Sorting configuration extracted from the writer options.
	sorting SortingConfig
	// Scratch state used to drop duplicated rows when
	// sorting.DropDuplicatedRows is enabled.
	dedupe dedupe
}

// NewSortingWriter constructs a new sorting writer which writes a parquet file
// where rows of each row group are ordered according to the sorting columns
// configured on the writer.
//
// The sortRowCount argument defines the target number of rows that will be
// sorted in memory before being written to temporary row groups. The greater
// this value the more memory is needed to buffer rows in memory. Choosing a
// value that is too small limits the maximum number of rows that can exist in
// the output file since the writer cannot create more than 32K temporary row
// groups to hold the sorted row chunks.
func NewSortingWriter[T any](output io.Writer, sortRowCount int64, options ...WriterOption) *SortingWriter[T] {
	config, err := NewWriterConfig(options...)
	if err != nil {
		// Invalid writer options are a programming error; panicking here
		// mirrors how misconfiguration is surfaced at construction time.
		panic(err)
	}
	return &SortingWriter[T]{
		// The row buffer only needs schema and sorting information to
		// order rows in memory.
		rowbuf: NewRowBuffer[T](&RowGroupConfig{
			Schema:  config.Schema,
			Sorting: config.Sorting,
		}),
		// The spill writer starts on io.Discard; it is re-targeted at the
		// temporary buffer on the first spill (see sortAndWriteBufferedRows).
		// Only a subset of the user configuration is propagated to it, since
		// its output is an internal intermediate file.
		writer: NewGenericWriter[T](io.Discard, &WriterConfig{
			CreatedBy:            config.CreatedBy,
			ColumnPageBuffers:    config.ColumnPageBuffers,
			ColumnIndexSizeLimit: config.ColumnIndexSizeLimit,
			PageBufferSize:       config.PageBufferSize,
			WriteBufferSize:      config.WriteBufferSize,
			DataPageVersion:      config.DataPageVersion,
			Schema:               config.Schema,
			Compression:          config.Compression,
			Sorting:              config.Sorting,
		}),
		// The output writer receives the full user configuration.
		output:  NewGenericWriter[T](output, config),
		maxRows: sortRowCount,
		sorting: config.Sorting,
	}
}

// Close flushes any buffered rows into the output and closes the underlying
// output writer, finalizing the parquet file.
func (w *SortingWriter[T]) Close() error {
	if err := w.Flush(); err != nil {
		return err
	}
	return w.output.Close()
}

// Flush sorts and spills any rows still held in memory, then merges all
// temporary row groups written so far into a single sorted row group of the
// output file. The temporary sorting buffer is released on return, whether or
// not an error occurred.
func (w *SortingWriter[T]) Flush() error {
	defer w.resetSortingBuffer()

	// Spill whatever is still in the in-memory row buffer so that all rows
	// are present in the temporary file before merging.
	if err := w.sortAndWriteBufferedRows(); err != nil {
		return err
	}

	// Nothing was ever spilled: there is nothing to merge or write.
	if w.numRows == 0 {
		return nil
	}

	// Close the spill writer to finalize the temporary parquet file footer
	// so it can be re-opened for reading below.
	if err := w.writer.Close(); err != nil {
		return err
	}

	// The current seek offset is the total size of the temporary file, which
	// OpenFile needs to locate the footer.
	size, err := w.buffer.Seek(0, io.SeekCurrent)
	if err != nil {
		return err
	}

	// Page index and bloom filters are skipped: the temporary file is read
	// back sequentially exactly once, so those structures are useless here.
	f, err := OpenFile(newReaderAt(w.buffer), size,
		&FileConfig{
			SkipPageIndex:    true,
			SkipBloomFilters: true,
			ReadBufferSize:   defaultReadBufferSize,
		},
	)
	if err != nil {
		return err
	}

	// Each spilled row group is individually sorted; merging them yields a
	// single globally sorted sequence of rows.
	m, err := MergeRowGroups(f.RowGroups(),
		&RowGroupConfig{
			Schema:  w.Schema(),
			Sorting: w.sorting,
		},
	)
	if err != nil {
		return err
	}

	rows := m.Rows()
	defer rows.Close()

	// Optionally drop duplicates across chunk boundaries while streaming the
	// merged rows; intra-chunk duplicates were already removed at spill time.
	reader := RowReader(rows)
	if w.sorting.DropDuplicatedRows {
		reader = DedupeRowReader(rows, w.rowbuf.compare)
	}

	if _, err := CopyRows(w.output, reader); err != nil {
		return err
	}

	return w.output.Flush()
}

// Reset re-targets the writer at a new output and clears all buffered state,
// allowing the SortingWriter to be reused for another file.
func (w *SortingWriter[T]) Reset(output io.Writer) {
	w.output.Reset(output)
	w.rowbuf.Reset()
	w.resetSortingBuffer()
}

// resetSortingBuffer returns the temporary spill buffer to the buffer pool
// (if one was acquired) and resets the spill writer and row count so the next
// batch of rows starts from a clean state.
func (w *SortingWriter[T]) resetSortingBuffer() {
	w.writer.Reset(io.Discard)
	w.numRows = 0

	if w.buffer != nil {
		w.sorting.SortingBuffers.PutBuffer(w.buffer)
		w.buffer = nil
	}
}

// Write buffers the given values, spilling sorted chunks to the temporary
// buffer whenever the in-memory row buffer reaches its target size. It returns
// the number of values written.
func (w *SortingWriter[T]) Write(rows []T) (int, error) {
	return w.writeRows(len(rows), func(i, j int) (int, error) { return w.rowbuf.Write(rows[i:j]) })
}

// WriteRows is the generic Row-based equivalent of Write.
func (w *SortingWriter[T]) WriteRows(rows []Row) (int, error) {
	return w.writeRows(len(rows), func(i, j int) (int, error) { return w.rowbuf.WriteRows(rows[i:j]) })
}

// writeRows factors the common buffering loop shared by Write and WriteRows:
// it feeds rows [0, numRows) into the row buffer in slices sized so the buffer
// never exceeds maxRows, spilling a sorted chunk each time the buffer is full.
// The writeRows callback writes the half-open range [i, j) of the caller's
// slice and reports how many rows it consumed.
func (w *SortingWriter[T]) writeRows(numRows int, writeRows func(i, j int) (int, error)) (int, error) {
	wn := 0

	for wn < numRows {
		// Spill first if the buffer is already at (or past) capacity, so the
		// slice computed below is never empty.
		if w.rowbuf.NumRows() >= w.maxRows {
			if err := w.sortAndWriteBufferedRows(); err != nil {
				return wn, err
			}
		}

		// Upper bound of the next slice: remaining buffer capacity, offset by
		// what has been written so far, clamped to the caller's slice length.
		n := int(w.maxRows - w.rowbuf.NumRows())
		n += wn
		if n > numRows {
			n = numRows
		}

		n, err := writeRows(wn, n)
		wn += n

		if err != nil {
			return wn, err
		}
	}

	return wn, nil
}

// SetKeyValueMetadata sets a key/value metadata pair on the output file.
func (w *SortingWriter[T]) SetKeyValueMetadata(key, value string) {
	w.output.SetKeyValueMetadata(key, value)
}

// Schema returns the schema of the output file.
func (w *SortingWriter[T]) Schema() *Schema {
	return w.output.Schema()
}

// sortAndWriteBufferedRows sorts the rows currently held in memory (dropping
// duplicates if configured) and writes them as one sorted row group to the
// temporary spill buffer, acquiring that buffer lazily on first use. The
// in-memory row buffer is reset on return.
func (w *SortingWriter[T]) sortAndWriteBufferedRows() error {
	if w.rowbuf.Len() == 0 {
		return nil
	}

	defer w.rowbuf.Reset()
	sort.Sort(w.rowbuf)

	if w.sorting.DropDuplicatedRows {
		// Deduplication only removes duplicates within this sorted chunk;
		// duplicates spanning chunks are handled at merge time in Flush.
		w.rowbuf.rows = w.rowbuf.rows[:w.dedupe.deduplicate(w.rowbuf.rows, w.rowbuf.compare)]
		defer w.dedupe.reset()
	}

	rows := w.rowbuf.Rows()
	defer rows.Close()

	// Lazily acquire the spill buffer and point the spill writer at it; this
	// avoids holding a buffer when the writer never receives enough rows to
	// spill.
	if w.buffer == nil {
		w.buffer = w.sorting.SortingBuffers.GetBuffer()
		w.writer.Reset(w.buffer)
	}

	n, err := CopyRows(w.writer, rows)
	if err != nil {
		return err
	}

	// Flushing ends the current row group in the temporary file, keeping each
	// sorted chunk in its own row group for the merge in Flush.
	if err := w.writer.Flush(); err != nil {
		return err
	}

	w.numRows += n
	return nil
}