github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/dedupe.go (about)

     1  package parquet
     2  
     3  // DedupeRowReader constructs a row reader which drops duplicated consecutive
     4  // rows, according to the comparator function passed as argument.
     5  //
     6  // If the underlying reader produces a sequence of rows sorted by the same
     7  // comparison predicate, the output is guaranteed to produce unique rows only.
     8  func DedupeRowReader(reader RowReader, compare func(Row, Row) int) RowReader {
     9  	return &dedupeRowReader{reader: reader, compare: compare}
    10  }
    11  
    12  type dedupeRowReader struct {
    13  	reader  RowReader
    14  	compare func(Row, Row) int
    15  	dedupe
    16  }
    17  
    18  func (d *dedupeRowReader) ReadRows(rows []Row) (int, error) {
    19  	for {
    20  		n, err := d.reader.ReadRows(rows)
    21  		n = d.deduplicate(rows[:n], d.compare)
    22  
    23  		if n > 0 || err != nil {
    24  			return n, err
    25  		}
    26  	}
    27  }
    28  
    29  // DedupeRowWriter constructs a row writer which drops duplicated consecutive
    30  // rows, according to the comparator function passed as argument.
    31  //
    32  // If the writer is given a sequence of rows sorted by the same comparison
    33  // predicate, the output is guaranteed to contain unique rows only.
    34  func DedupeRowWriter(writer RowWriter, compare func(Row, Row) int) RowWriter {
    35  	return &dedupeRowWriter{writer: writer, compare: compare}
    36  }
    37  
    38  type dedupeRowWriter struct {
    39  	writer  RowWriter
    40  	compare func(Row, Row) int
    41  	dedupe
    42  	rows []Row
    43  }
    44  
    45  func (d *dedupeRowWriter) WriteRows(rows []Row) (int, error) {
    46  	// We need to make a copy because we cannot modify the rows slice received
    47  	// as argument to respect the RowWriter contract.
    48  	d.rows = append(d.rows[:0], rows...)
    49  	defer func() {
    50  		for i := range d.rows {
    51  			d.rows[i] = Row{}
    52  		}
    53  	}()
    54  
    55  	if n := d.deduplicate(d.rows, d.compare); n > 0 {
    56  		w, err := d.writer.WriteRows(d.rows[:n])
    57  		if err != nil {
    58  			return w, err
    59  		}
    60  	}
    61  
    62  	// Return the number of rows received instead of the number of deduplicated
    63  	// rows actually written to the underlying writer because we have to repsect
    64  	// the RowWriter contract.
    65  	return len(rows), nil
    66  }
    67  
    68  type dedupe struct {
    69  	alloc   rowAllocator
    70  	lastRow Row
    71  	uniq    []Row
    72  	dupe    []Row
    73  }
    74  
    75  func (d *dedupe) reset() {
    76  	d.alloc.reset()
    77  	d.lastRow = d.lastRow[:0]
    78  }
    79  
    80  func (d *dedupe) deduplicate(rows []Row, compare func(Row, Row) int) int {
    81  	defer func() {
    82  		for i := range d.uniq {
    83  			d.uniq[i] = Row{}
    84  		}
    85  		for i := range d.dupe {
    86  			d.dupe[i] = Row{}
    87  		}
    88  		d.uniq = d.uniq[:0]
    89  		d.dupe = d.dupe[:0]
    90  	}()
    91  
    92  	lastRow := d.lastRow
    93  
    94  	for _, row := range rows {
    95  		if len(lastRow) != 0 && compare(row, lastRow) == 0 {
    96  			d.dupe = append(d.dupe, row)
    97  		} else {
    98  			lastRow = row
    99  			d.uniq = append(d.uniq, row)
   100  		}
   101  	}
   102  
   103  	rows = rows[:0]
   104  	rows = append(rows, d.uniq...)
   105  	rows = append(rows, d.dupe...)
   106  
   107  	d.alloc.reset()
   108  	d.alloc.capture(lastRow)
   109  	d.lastRow = append(d.lastRow[:0], lastRow...)
   110  	return len(d.uniq)
   111  }