kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/go/platform/tools/dedup_stream/dedup_stream.go (about)

     1  /*
     2   * Copyright 2015 The Kythe Authors. All rights reserved.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *   http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  // Binary dedup_stream reads a delimited stream from stdin and writes a delimited stream to stdout.
    18  // Each record in the stream will be hashed, and if that hash value has already been seen, the
    19  // record will not be emitted.
    20  package main
    21  
    22  import (
    23  	"flag"
    24  	"os"
    25  
    26  	"kythe.io/kythe/go/platform/delimited"
    27  	"kythe.io/kythe/go/platform/delimited/dedup"
    28  	"kythe.io/kythe/go/util/datasize"
    29  	"kythe.io/kythe/go/util/flagutil"
    30  	"kythe.io/kythe/go/util/log"
    31  )
    32  
    33  func init() {
    34  	flag.Usage = flagutil.SimpleUsage("Remove duplicate records from a delimited stream")
    35  }
    36  
    37  var cacheSize = datasize.Flag("cache_size", "3GiB", `Maximum size of the cache of known record hashes (e.g. "10B", "12KB", "3GiB", etc.)`)
    38  
    39  func main() {
    40  	flag.Parse()
    41  	if flag.NArg() != 0 {
    42  		flagutil.UsageErrorf("unknown arguments: %v", flag.Args())
    43  	}
    44  
    45  	rd, err := dedup.NewReader(os.Stdin, int(cacheSize.Bytes()))
    46  	if err != nil {
    47  		log.Fatalf("Error creating UniqReader: %v", err)
    48  	}
    49  	wr := delimited.NewWriter(os.Stdout)
    50  	if err := delimited.Copy(wr, rd); err != nil {
    51  		log.Fatal(err)
    52  	}
    53  	log.Infof("dedup_stream: skipped %d records", rd.Skipped())
    54  }