// Source: kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/go/platform/delimited/dedup/dedup.go

/*
 * Copyright 2016 The Kythe Authors. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Package dedup implements a duplication-reducing reader for streams of
// length-delimited byte records. Each record is read as a varint-encoded
// length in bytes, followed immediately by the record itself.
//
// A stream consists of a sequence of such records packed consecutively without
// additional padding. There are no checksums or compression.
// See also: kythe.io/kythe/go/platform/delimited.
package dedup // import "kythe.io/kythe/go/platform/delimited/dedup"

import (
	"io"

	"kythe.io/kythe/go/platform/delimited"
	"kythe.io/kythe/go/util/dedup"

	"google.golang.org/protobuf/proto"
)

// Reader implements the Reader interface. Duplicate records are removed by
// hashing each and checking against a set of known record hashes. This is a
// quick-and-dirty method of removing duplicates; it will not be perfect.
//
// NOTE(review): "the Reader interface" presumably refers to the one declared
// in the delimited package -- confirm.
//
// The fields are initialized positionally by NewReader, so their order must
// not change without updating that constructor.
type Reader struct {
	// r is the underlying delimited reader that Next pulls raw records from.
	r *delimited.Reader
	// d decides whether a record has been seen before and counts the
	// duplicates reported by Skipped.
	d *dedup.Deduper
}

// NewReader returns a reader that consumes records from r, using a cache of up
// to maxSize bytes for known record hashes. If the deduper cannot be
// constructed, its error is returned and the reader is nil.
45 func NewReader(r io.Reader, maxSize int) (*Reader, error) { 46 d, err := dedup.New(maxSize) 47 if err != nil { 48 return nil, err 49 } 50 return &Reader{delimited.NewReader(r), d}, nil 51 } 52 53 // Next returns the next length-delimited record from the input, or io.EOF if 54 // there are no more records available. Returns io.ErrUnexpectedEOF if a short 55 // record is found, with a length of n but fewer than n bytes of data. Because 56 // there is no resynchronization mechanism, it is generally not possible to 57 // recover from a short record in this format. 58 // 59 // The slice returned is valid only until a subsequent call to Next. 60 func (u *Reader) Next() ([]byte, error) { 61 for { 62 rec, err := u.r.Next() 63 if err != nil { 64 return nil, err 65 } else if u.d.IsUnique(rec) { 66 return rec, nil 67 } 68 } 69 } 70 71 // NextProto consumes the next available record by calling r.Next, and decodes 72 // it into pb with proto.Unmarshal. 73 func (u *Reader) NextProto(pb proto.Message) error { 74 rec, err := u.Next() 75 if err != nil { 76 return err 77 } 78 return proto.Unmarshal(rec, pb) 79 } 80 81 // Skipped returns the number of records that have been skipped so far by the 82 // deduplication process. 83 func (u *Reader) Skipped() uint64 { return u.d.Duplicates() }