github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/sorting.go

//go:build go1.18

package parquet

import (
	"io"
	"sort"
)

// SortingWriter is a type similar to GenericWriter, but it ensures that rows
// are sorted according to the sorting columns configured on the writer.
//
// The writer accumulates rows in an in-memory buffer which is sorted when it
// reaches the target number of rows, then written to a temporary row group.
// When the writer is flushed or closed, the temporary row groups are merged
// into a row group in the output file, ensuring that rows remain sorted in the
// final row group.
//
// Because row groups get encoded and compressed, they take up a lot less
// memory than if all rows were retained in memory. Sorting then merging row
// chunks also tends to be a lot more efficient than sorting all rows in
// memory, as it results in better CPU cache utilization: sorting
// multi-megabyte arrays causes a lot of cache misses because the data set
// cannot be held in CPU caches.
type SortingWriter[T any] struct {
	rowbuf  *RowBuffer[T]
	writer  *GenericWriter[T]
	output  *GenericWriter[T]
	buffer  io.ReadWriteSeeker
	maxRows int64
	numRows int64
	sorting SortingConfig
	dedupe  dedupe
}

// NewSortingWriter constructs a new sorting writer which writes a parquet file
// where rows of each row group are ordered according to the sorting columns
// configured on the writer.
//
// The sortRowCount argument defines the target number of rows that will be
// sorted in memory before being written to temporary row groups. The greater
// this value, the more memory is needed to buffer rows in memory. Choosing a
// value that is too small limits the maximum number of rows that can exist in
// the output file, since the writer cannot create more than 32K temporary row
// groups to hold the sorted row chunks.
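//
// The sketch below shows a typical use: write rows, then close the writer to
// produce the sorted output file. The SortingWriterConfig, SortingColumns, and
// Ascending options are assumed from this package's writer configuration API,
// and the Contact type, column names, and variables are hypothetical:
//
//	type Contact struct {
//		FirstName string `parquet:"first_name"`
//		LastName  string `parquet:"last_name"`
//	}
//
//	output, err := os.Create("contacts.parquet")
//	if err != nil {
//		log.Fatal(err)
//	}
//
//	writer := parquet.NewSortingWriter[Contact](output, 10_000,
//		parquet.SortingWriterConfig(
//			parquet.SortingColumns(
//				parquet.Ascending("last_name"),
//				parquet.Ascending("first_name"),
//			),
//		),
//	)
//	if _, err := writer.Write(contacts); err != nil {
//		log.Fatal(err)
//	}
//	if err := writer.Close(); err != nil {
//		log.Fatal(err)
//	}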
func NewSortingWriter[T any](output io.Writer, sortRowCount int64, options ...WriterOption) *SortingWriter[T] {
	config, err := NewWriterConfig(options...)
	if err != nil {
		panic(err)
	}
	return &SortingWriter[T]{
		rowbuf: NewRowBuffer[T](&RowGroupConfig{
			Schema:  config.Schema,
			Sorting: config.Sorting,
		}),
		writer: NewGenericWriter[T](io.Discard, &WriterConfig{
			CreatedBy:            config.CreatedBy,
			ColumnPageBuffers:    config.ColumnPageBuffers,
			ColumnIndexSizeLimit: config.ColumnIndexSizeLimit,
			PageBufferSize:       config.PageBufferSize,
			WriteBufferSize:      config.WriteBufferSize,
			DataPageVersion:      config.DataPageVersion,
			Schema:               config.Schema,
			Compression:          config.Compression,
			Sorting:              config.Sorting,
		}),
		output:  NewGenericWriter[T](output, config),
		maxRows: sortRowCount,
		sorting: config.Sorting,
	}
}

// Close flushes any rows remaining in the sorting buffers and closes the
// underlying output writer.
func (w *SortingWriter[T]) Close() error {
	if err := w.Flush(); err != nil {
		return err
	}
	return w.output.Close()
}

// Flush sorts the rows currently buffered in memory, merges them with the
// temporary row groups written so far, and writes the result as a sorted row
// group of the output file.
func (w *SortingWriter[T]) Flush() error {
	defer w.resetSortingBuffer()

	if err := w.sortAndWriteBufferedRows(); err != nil {
		return err
	}

	if w.numRows == 0 {
		return nil
	}

	if err := w.writer.Close(); err != nil {
		return err
	}

	size, err := w.buffer.Seek(0, io.SeekCurrent)
	if err != nil {
		return err
	}

	f, err := OpenFile(newReaderAt(w.buffer), size,
		&FileConfig{
			SkipPageIndex:    true,
			SkipBloomFilters: true,
			ReadBufferSize:   defaultReadBufferSize,
		},
	)
	if err != nil {
		return err
	}

	m, err := MergeRowGroups(f.RowGroups(),
		&RowGroupConfig{
			Schema:  w.Schema(),
			Sorting: w.sorting,
		},
	)
	if err != nil {
		return err
	}

	rows := m.Rows()
	defer rows.Close()

	reader := RowReader(rows)
	if w.sorting.DropDuplicatedRows {
		reader = DedupeRowReader(rows, w.rowbuf.compare)
	}

	if _, err := CopyRows(w.output, reader); err != nil {
		return err
	}

	return w.output.Flush()
}

// Reset clears the state of the writer and directs it to write to the given
// output.
func (w *SortingWriter[T]) Reset(output io.Writer) {
	w.output.Reset(output)
	w.rowbuf.Reset()
	w.resetSortingBuffer()
}

func (w *SortingWriter[T]) resetSortingBuffer() {
	w.writer.Reset(io.Discard)
	w.numRows = 0

	if w.buffer != nil {
		w.sorting.SortingBuffers.PutBuffer(w.buffer)
		w.buffer = nil
	}
}

// Write buffers the given rows, sorting and spilling them to temporary row
// groups whenever the in-memory buffer reaches its target size.
func (w *SortingWriter[T]) Write(rows []T) (int, error) {
	return w.writeRows(len(rows), func(i, j int) (int, error) { return w.rowbuf.Write(rows[i:j]) })
}

// WriteRows is like Write, but it accepts parquet rows instead of Go values.
func (w *SortingWriter[T]) WriteRows(rows []Row) (int, error) {
	return w.writeRows(len(rows), func(i, j int) (int, error) { return w.rowbuf.WriteRows(rows[i:j]) })
}

// writeRows writes numRows rows in chunks, flushing the in-memory buffer to a
// temporary row group whenever it reaches the target row count.
func (w *SortingWriter[T]) writeRows(numRows int, writeRows func(i, j int) (int, error)) (int, error) {
	wn := 0

	for wn < numRows {
		if w.rowbuf.NumRows() >= w.maxRows {
			if err := w.sortAndWriteBufferedRows(); err != nil {
				return wn, err
			}
		}

		n := int(w.maxRows - w.rowbuf.NumRows())
		n += wn
		if n > numRows {
			n = numRows
		}

		n, err := writeRows(wn, n)
		wn += n

		if err != nil {
			return wn, err
		}
	}

	return wn, nil
}

// SetKeyValueMetadata sets a key/value pair in the metadata of the output file.
func (w *SortingWriter[T]) SetKeyValueMetadata(key, value string) {
	w.output.SetKeyValueMetadata(key, value)
}

// Schema returns the schema of the parquet file being written.
func (w *SortingWriter[T]) Schema() *Schema {
	return w.output.Schema()
}

// sortAndWriteBufferedRows sorts the rows buffered in memory, optionally drops
// duplicates, and appends them as a new row group to the temporary sorting
// buffer.
func (w *SortingWriter[T]) sortAndWriteBufferedRows() error {
	if w.rowbuf.Len() == 0 {
		return nil
	}

	defer w.rowbuf.Reset()
	sort.Sort(w.rowbuf)

	if w.sorting.DropDuplicatedRows {
		w.rowbuf.rows = w.rowbuf.rows[:w.dedupe.deduplicate(w.rowbuf.rows, w.rowbuf.compare)]
		defer w.dedupe.reset()
	}

	rows := w.rowbuf.Rows()
	defer rows.Close()

	if w.buffer == nil {
		w.buffer = w.sorting.SortingBuffers.GetBuffer()
		w.writer.Reset(w.buffer)
	}

	n, err := CopyRows(w.writer, rows)
	if err != nil {
		return err
	}

	if err := w.writer.Flush(); err != nil {
		return err
	}

	w.numRows += n
	return nil
}