github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/dedupe.go (about) 1 package parquet 2 3 // DedupeRowReader constructs a row reader which drops duplicated consecutive 4 // rows, according to the comparator function passed as argument. 5 // 6 // If the underlying reader produces a sequence of rows sorted by the same 7 // comparison predicate, the output is guaranteed to produce unique rows only. 8 func DedupeRowReader(reader RowReader, compare func(Row, Row) int) RowReader { 9 return &dedupeRowReader{reader: reader, compare: compare} 10 } 11 12 type dedupeRowReader struct { 13 reader RowReader 14 compare func(Row, Row) int 15 dedupe 16 } 17 18 func (d *dedupeRowReader) ReadRows(rows []Row) (int, error) { 19 for { 20 n, err := d.reader.ReadRows(rows) 21 n = d.deduplicate(rows[:n], d.compare) 22 23 if n > 0 || err != nil { 24 return n, err 25 } 26 } 27 } 28 29 // DedupeRowWriter constructs a row writer which drops duplicated consecutive 30 // rows, according to the comparator function passed as argument. 31 // 32 // If the writer is given a sequence of rows sorted by the same comparison 33 // predicate, the output is guaranteed to contain unique rows only. 34 func DedupeRowWriter(writer RowWriter, compare func(Row, Row) int) RowWriter { 35 return &dedupeRowWriter{writer: writer, compare: compare} 36 } 37 38 type dedupeRowWriter struct { 39 writer RowWriter 40 compare func(Row, Row) int 41 dedupe 42 rows []Row 43 } 44 45 func (d *dedupeRowWriter) WriteRows(rows []Row) (int, error) { 46 // We need to make a copy because we cannot modify the rows slice received 47 // as argument to respect the RowWriter contract. 48 d.rows = append(d.rows[:0], rows...) 49 defer func() { 50 for i := range d.rows { 51 d.rows[i] = Row{} 52 } 53 }() 54 55 if n := d.deduplicate(d.rows, d.compare); n > 0 { 56 w, err := d.writer.WriteRows(d.rows[:n]) 57 if err != nil { 58 return w, err 59 } 60 } 61 62 // Return the number of rows received instead of the number of deduplicated 63 // rows actually written to the underlying writer because we have to repsect 64 // the RowWriter contract. 65 return len(rows), nil 66 } 67 68 type dedupe struct { 69 alloc rowAllocator 70 lastRow Row 71 uniq []Row 72 dupe []Row 73 } 74 75 func (d *dedupe) reset() { 76 d.alloc.reset() 77 d.lastRow = d.lastRow[:0] 78 } 79 80 func (d *dedupe) deduplicate(rows []Row, compare func(Row, Row) int) int { 81 defer func() { 82 for i := range d.uniq { 83 d.uniq[i] = Row{} 84 } 85 for i := range d.dupe { 86 d.dupe[i] = Row{} 87 } 88 d.uniq = d.uniq[:0] 89 d.dupe = d.dupe[:0] 90 }() 91 92 lastRow := d.lastRow 93 94 for _, row := range rows { 95 if len(lastRow) != 0 && compare(row, lastRow) == 0 { 96 d.dupe = append(d.dupe, row) 97 } else { 98 lastRow = row 99 d.uniq = append(d.uniq, row) 100 } 101 } 102 103 rows = rows[:0] 104 rows = append(rows, d.uniq...) 105 rows = append(rows, d.dupe...) 106 107 d.alloc.reset() 108 d.alloc.capture(lastRow) 109 d.lastRow = append(d.lastRow[:0], lastRow...) 110 return len(d.uniq) 111 }