github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/sorting.go (about)

     1  //go:build go1.18
     2  
     3  package parquet
     4  
     5  import (
     6  	"io"
     7  	"sort"
     8  )
     9  
    10  // SortingWriter is a type similar to GenericWriter but it ensures that rows
    11  // are sorted according to the sorting columns configured on the writer.
    12  //
    13  // The writer accumulates rows in an in-memory buffer which is sorted when it
    14  // reaches the target number of rows, then written to a temporary row group.
    15  // When the writer is flushed or closed, the temporary row groups are merged
    16  // into a row group in the output file, ensuring that rows remain sorted in the
    17  // final row group.
    18  //
    19  // Because row groups get encoded and compressed, they hold a lot less memory
    20  // than if all rows were retained in memory. Sorting then merging rows chunks
    21  // also tends to be a lot more efficient than sorting all rows in memory as it
    22  // results in better CPU cache utilization since sorting multi-megabyte arrays
    23  // causes a lot of cache misses since the data set cannot be held in CPU caches.
    24  type SortingWriter[T any] struct {
    25  	rowbuf  *RowBuffer[T]
    26  	writer  *GenericWriter[T]
    27  	output  *GenericWriter[T]
    28  	buffer  io.ReadWriteSeeker
    29  	maxRows int64
    30  	numRows int64
    31  	sorting SortingConfig
    32  	dedupe  dedupe
    33  }
    34  
    35  // NewSortingWriter constructs a new sorting writer which writes a parquet file
    36  // where rows of each row group are ordered according to the sorting columns
    37  // configured on the writer.
    38  //
    39  // The sortRowCount argument defines the target number of rows that will be
    40  // sorted in memory before being written to temporary row groups. The greater
    41  // this value the more memory is needed to buffer rows in memory. Choosing a
    42  // value that is too small limits the maximum number of rows that can exist in
    43  // the output file since the writer cannot create more than 32K temporary row
    44  // groups to hold the sorted row chunks.
    45  func NewSortingWriter[T any](output io.Writer, sortRowCount int64, options ...WriterOption) *SortingWriter[T] {
    46  	config, err := NewWriterConfig(options...)
    47  	if err != nil {
    48  		panic(err)
    49  	}
    50  	return &SortingWriter[T]{
    51  		rowbuf: NewRowBuffer[T](&RowGroupConfig{
    52  			Schema:  config.Schema,
    53  			Sorting: config.Sorting,
    54  		}),
    55  		writer: NewGenericWriter[T](io.Discard, &WriterConfig{
    56  			CreatedBy:            config.CreatedBy,
    57  			ColumnPageBuffers:    config.ColumnPageBuffers,
    58  			ColumnIndexSizeLimit: config.ColumnIndexSizeLimit,
    59  			PageBufferSize:       config.PageBufferSize,
    60  			WriteBufferSize:      config.WriteBufferSize,
    61  			DataPageVersion:      config.DataPageVersion,
    62  			Schema:               config.Schema,
    63  			Compression:          config.Compression,
    64  			Sorting:              config.Sorting,
    65  		}),
    66  		output:  NewGenericWriter[T](output, config),
    67  		maxRows: sortRowCount,
    68  		sorting: config.Sorting,
    69  	}
    70  }
    71  
    72  func (w *SortingWriter[T]) Close() error {
    73  	if err := w.Flush(); err != nil {
    74  		return err
    75  	}
    76  	return w.output.Close()
    77  }
    78  
    79  func (w *SortingWriter[T]) Flush() error {
    80  	defer w.resetSortingBuffer()
    81  
    82  	if err := w.sortAndWriteBufferedRows(); err != nil {
    83  		return err
    84  	}
    85  
    86  	if w.numRows == 0 {
    87  		return nil
    88  	}
    89  
    90  	if err := w.writer.Close(); err != nil {
    91  		return err
    92  	}
    93  
    94  	size, err := w.buffer.Seek(0, io.SeekCurrent)
    95  	if err != nil {
    96  		return err
    97  	}
    98  
    99  	f, err := OpenFile(newReaderAt(w.buffer), size,
   100  		&FileConfig{
   101  			SkipPageIndex:    true,
   102  			SkipBloomFilters: true,
   103  			ReadBufferSize:   defaultReadBufferSize,
   104  		},
   105  	)
   106  	if err != nil {
   107  		return err
   108  	}
   109  
   110  	m, err := MergeRowGroups(f.RowGroups(),
   111  		&RowGroupConfig{
   112  			Schema:  w.Schema(),
   113  			Sorting: w.sorting,
   114  		},
   115  	)
   116  	if err != nil {
   117  		return err
   118  	}
   119  
   120  	rows := m.Rows()
   121  	defer rows.Close()
   122  
   123  	reader := RowReader(rows)
   124  	if w.sorting.DropDuplicatedRows {
   125  		reader = DedupeRowReader(rows, w.rowbuf.compare)
   126  	}
   127  
   128  	if _, err := CopyRows(w.output, reader); err != nil {
   129  		return err
   130  	}
   131  
   132  	return w.output.Flush()
   133  }
   134  
   135  func (w *SortingWriter[T]) Reset(output io.Writer) {
   136  	w.output.Reset(output)
   137  	w.rowbuf.Reset()
   138  	w.resetSortingBuffer()
   139  }
   140  
   141  func (w *SortingWriter[T]) resetSortingBuffer() {
   142  	w.writer.Reset(io.Discard)
   143  	w.numRows = 0
   144  
   145  	if w.buffer != nil {
   146  		w.sorting.SortingBuffers.PutBuffer(w.buffer)
   147  		w.buffer = nil
   148  	}
   149  }
   150  
   151  func (w *SortingWriter[T]) Write(rows []T) (int, error) {
   152  	return w.writeRows(len(rows), func(i, j int) (int, error) { return w.rowbuf.Write(rows[i:j]) })
   153  }
   154  
   155  func (w *SortingWriter[T]) WriteRows(rows []Row) (int, error) {
   156  	return w.writeRows(len(rows), func(i, j int) (int, error) { return w.rowbuf.WriteRows(rows[i:j]) })
   157  }
   158  
   159  func (w *SortingWriter[T]) writeRows(numRows int, writeRows func(i, j int) (int, error)) (int, error) {
   160  	wn := 0
   161  
   162  	for wn < numRows {
   163  		if w.rowbuf.NumRows() >= w.maxRows {
   164  			if err := w.sortAndWriteBufferedRows(); err != nil {
   165  				return wn, err
   166  			}
   167  		}
   168  
   169  		n := int(w.maxRows - w.rowbuf.NumRows())
   170  		n += wn
   171  		if n > numRows {
   172  			n = numRows
   173  		}
   174  
   175  		n, err := writeRows(wn, n)
   176  		wn += n
   177  
   178  		if err != nil {
   179  			return wn, err
   180  		}
   181  	}
   182  
   183  	return wn, nil
   184  }
   185  
   186  func (w *SortingWriter[T]) SetKeyValueMetadata(key, value string) {
   187  	w.output.SetKeyValueMetadata(key, value)
   188  }
   189  
   190  func (w *SortingWriter[T]) Schema() *Schema {
   191  	return w.output.Schema()
   192  }
   193  
   194  func (w *SortingWriter[T]) sortAndWriteBufferedRows() error {
   195  	if w.rowbuf.Len() == 0 {
   196  		return nil
   197  	}
   198  
   199  	defer w.rowbuf.Reset()
   200  	sort.Sort(w.rowbuf)
   201  
   202  	if w.sorting.DropDuplicatedRows {
   203  		w.rowbuf.rows = w.rowbuf.rows[:w.dedupe.deduplicate(w.rowbuf.rows, w.rowbuf.compare)]
   204  		defer w.dedupe.reset()
   205  	}
   206  
   207  	rows := w.rowbuf.Rows()
   208  	defer rows.Close()
   209  
   210  	if w.buffer == nil {
   211  		w.buffer = w.sorting.SortingBuffers.GetBuffer()
   212  		w.writer.Reset(w.buffer)
   213  	}
   214  
   215  	n, err := CopyRows(w.writer, rows)
   216  	if err != nil {
   217  		return err
   218  	}
   219  
   220  	if err := w.writer.Flush(); err != nil {
   221  		return err
   222  	}
   223  
   224  	w.numRows += n
   225  	return nil
   226  }