kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/go/platform/delimited/dedup/dedup.go (about)

     1  /*
     2   * Copyright 2016 The Kythe Authors. All rights reserved.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *   http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  // Package dedup implements a duplication-reducing reader for streams of
    18  // length-delimited byte records.  Each record is read as a varint-encoded
    19  // length in bytes, followed immediately by the record itself.
    20  //
    21  // A stream consists of a sequence of such records packed consecutively without
    22  // additional padding.  There are no checksums or compression.
    23  // See also: kythe.io/kythe/go/platform/delimited.
    24  package dedup // import "kythe.io/kythe/go/platform/delimited/dedup"
    25  
    26  import (
    27  	"io"
    28  
    29  	"kythe.io/kythe/go/platform/delimited"
    30  	"kythe.io/kythe/go/util/dedup"
    31  
    32  	"google.golang.org/protobuf/proto"
    33  )
    34  
// Reader implements the Reader interface.  Duplicate records are removed by
// hashing each and checking against a set of known record hashes.  This is a
// quick-and-dirty method of removing duplicates; it will not be perfect.
//
// A Reader must be constructed with NewReader so that both fields are
// populated; the methods dereference them without nil checks.
type Reader struct {
	r *delimited.Reader // underlying source of length-delimited records
	d *dedup.Deduper    // decides whether a record is unique and counts duplicates
}
    42  
    43  // NewReader returns a reader that consumes records from r, using a cache of up
    44  // to maxSize bytes for known record hashes.
    45  func NewReader(r io.Reader, maxSize int) (*Reader, error) {
    46  	d, err := dedup.New(maxSize)
    47  	if err != nil {
    48  		return nil, err
    49  	}
    50  	return &Reader{delimited.NewReader(r), d}, nil
    51  }
    52  
    53  // Next returns the next length-delimited record from the input, or io.EOF if
    54  // there are no more records available.  Returns io.ErrUnexpectedEOF if a short
    55  // record is found, with a length of n but fewer than n bytes of data.  Because
    56  // there is no resynchronization mechanism, it is generally not possible to
    57  // recover from a short record in this format.
    58  //
    59  // The slice returned is valid only until a subsequent call to Next.
    60  func (u *Reader) Next() ([]byte, error) {
    61  	for {
    62  		rec, err := u.r.Next()
    63  		if err != nil {
    64  			return nil, err
    65  		} else if u.d.IsUnique(rec) {
    66  			return rec, nil
    67  		}
    68  	}
    69  }
    70  
    71  // NextProto consumes the next available record by calling r.Next, and decodes
    72  // it into pb with proto.Unmarshal.
    73  func (u *Reader) NextProto(pb proto.Message) error {
    74  	rec, err := u.Next()
    75  	if err != nil {
    76  		return err
    77  	}
    78  	return proto.Unmarshal(rec, pb)
    79  }
    80  
    81  // Skipped returns the number of records that have been skipped so far by the
    82  // deduplication process.
    83  func (u *Reader) Skipped() uint64 { return u.d.Duplicates() }