github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/sorting.go (about) 1 package parquet 2 3 import ( 4 "io" 5 "sort" 6 ) 7 8 // SortingWriter is a type similar to GenericWriter but it ensures that rows 9 // are sorted according to the sorting columns configured on the writer. 10 // 11 // The writer accumulates rows in an in-memory buffer which is sorted when it 12 // reaches the target number of rows, then written to a temporary row group. 13 // When the writer is flushed or closed, the temporary row groups are merged 14 // into a row group in the output file, ensuring that rows remain sorted in the 15 // final row group. 16 // 17 // Because row groups get encoded and compressed, they hold a lot less memory 18 // than if all rows were retained in memory. Sorting then merging rows chunks 19 // also tends to be a lot more efficient than sorting all rows in memory as it 20 // results in better CPU cache utilization since sorting multi-megabyte arrays 21 // causes a lot of cache misses since the data set cannot be held in CPU caches. 22 type SortingWriter[T any] struct { 23 rowbuf *RowBuffer[T] 24 writer *GenericWriter[T] 25 output *GenericWriter[T] 26 buffer io.ReadWriteSeeker 27 maxRows int64 28 numRows int64 29 sorting SortingConfig 30 dedupe dedupe 31 } 32 33 // NewSortingWriter constructs a new sorting writer which writes a parquet file 34 // where rows of each row group are ordered according to the sorting columns 35 // configured on the writer. 36 // 37 // The sortRowCount argument defines the target number of rows that will be 38 // sorted in memory before being written to temporary row groups. The greater 39 // this value the more memory is needed to buffer rows in memory. Choosing a 40 // value that is too small limits the maximum number of rows that can exist in 41 // the output file since the writer cannot create more than 32K temporary row 42 // groups to hold the sorted row chunks. 43 func NewSortingWriter[T any](output io.Writer, sortRowCount int64, options ...WriterOption) *SortingWriter[T] { 44 config, err := NewWriterConfig(options...) 45 if err != nil { 46 panic(err) 47 } 48 return &SortingWriter[T]{ 49 rowbuf: NewRowBuffer[T](&RowGroupConfig{ 50 Schema: config.Schema, 51 Sorting: config.Sorting, 52 }), 53 writer: NewGenericWriter[T](io.Discard, &WriterConfig{ 54 CreatedBy: config.CreatedBy, 55 ColumnPageBuffers: config.ColumnPageBuffers, 56 ColumnIndexSizeLimit: config.ColumnIndexSizeLimit, 57 PageBufferSize: config.PageBufferSize, 58 WriteBufferSize: config.WriteBufferSize, 59 DataPageVersion: config.DataPageVersion, 60 Schema: config.Schema, 61 Compression: config.Compression, 62 Sorting: config.Sorting, 63 }), 64 output: NewGenericWriter[T](output, config), 65 maxRows: sortRowCount, 66 sorting: config.Sorting, 67 } 68 } 69 70 func (w *SortingWriter[T]) Close() error { 71 if err := w.Flush(); err != nil { 72 return err 73 } 74 return w.output.Close() 75 } 76 77 func (w *SortingWriter[T]) Flush() error { 78 defer w.resetSortingBuffer() 79 80 if err := w.sortAndWriteBufferedRows(); err != nil { 81 return err 82 } 83 84 if w.numRows == 0 { 85 return nil 86 } 87 88 if err := w.writer.Close(); err != nil { 89 return err 90 } 91 92 size, err := w.buffer.Seek(0, io.SeekCurrent) 93 if err != nil { 94 return err 95 } 96 97 f, err := OpenFile(newReaderAt(w.buffer), size, 98 &FileConfig{ 99 SkipPageIndex: true, 100 SkipBloomFilters: true, 101 ReadBufferSize: defaultReadBufferSize, 102 }, 103 ) 104 if err != nil { 105 return err 106 } 107 108 m, err := MergeRowGroups(f.RowGroups(), 109 &RowGroupConfig{ 110 Schema: w.Schema(), 111 Sorting: w.sorting, 112 }, 113 ) 114 if err != nil { 115 return err 116 } 117 118 rows := m.Rows() 119 defer rows.Close() 120 121 reader := RowReader(rows) 122 if w.sorting.DropDuplicatedRows { 123 reader = DedupeRowReader(rows, w.rowbuf.compare) 124 } 125 126 if _, err := CopyRows(w.output, reader); err != nil { 127 return err 128 } 129 130 return w.output.Flush() 131 } 132 133 func (w *SortingWriter[T]) Reset(output io.Writer) { 134 w.output.Reset(output) 135 w.rowbuf.Reset() 136 w.resetSortingBuffer() 137 } 138 139 func (w *SortingWriter[T]) resetSortingBuffer() { 140 w.writer.Reset(io.Discard) 141 w.numRows = 0 142 143 if w.buffer != nil { 144 w.sorting.SortingBuffers.PutBuffer(w.buffer) 145 w.buffer = nil 146 } 147 } 148 149 func (w *SortingWriter[T]) Write(rows []T) (int, error) { 150 return w.writeRows(len(rows), func(i, j int) (int, error) { return w.rowbuf.Write(rows[i:j]) }) 151 } 152 153 func (w *SortingWriter[T]) WriteRows(rows []Row) (int, error) { 154 return w.writeRows(len(rows), func(i, j int) (int, error) { return w.rowbuf.WriteRows(rows[i:j]) }) 155 } 156 157 func (w *SortingWriter[T]) writeRows(numRows int, writeRows func(i, j int) (int, error)) (int, error) { 158 wn := 0 159 160 for wn < numRows { 161 if w.rowbuf.NumRows() >= w.maxRows { 162 if err := w.sortAndWriteBufferedRows(); err != nil { 163 return wn, err 164 } 165 } 166 167 n := int(w.maxRows - w.rowbuf.NumRows()) 168 n += wn 169 if n > numRows { 170 n = numRows 171 } 172 173 n, err := writeRows(wn, n) 174 wn += n 175 176 if err != nil { 177 return wn, err 178 } 179 } 180 181 return wn, nil 182 } 183 184 func (w *SortingWriter[T]) SetKeyValueMetadata(key, value string) { 185 w.output.SetKeyValueMetadata(key, value) 186 } 187 188 func (w *SortingWriter[T]) Schema() *Schema { 189 return w.output.Schema() 190 } 191 192 func (w *SortingWriter[T]) sortAndWriteBufferedRows() error { 193 if w.rowbuf.Len() == 0 { 194 return nil 195 } 196 197 defer w.rowbuf.Reset() 198 sort.Sort(w.rowbuf) 199 200 if w.sorting.DropDuplicatedRows { 201 w.rowbuf.rows = w.rowbuf.rows[:w.dedupe.deduplicate(w.rowbuf.rows, w.rowbuf.compare)] 202 defer w.dedupe.reset() 203 } 204 205 rows := w.rowbuf.Rows() 206 defer rows.Close() 207 208 if w.buffer == nil { 209 w.buffer = w.sorting.SortingBuffers.GetBuffer() 210 w.writer.Reset(w.buffer) 211 } 212 213 n, err := CopyRows(w.writer, rows) 214 if err != nil { 215 return err 216 } 217 218 if err := w.writer.Flush(); err != nil { 219 return err 220 } 221 222 w.numRows += n 223 return nil 224 }