kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/go/platform/tools/entrystream/entrystream.go (about)

     1  /*
     2   * Copyright 2014 The Kythe Authors. All rights reserved.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *   http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  // Binary entrystream provides tools to manipulate a stream of delimited Entry
    18  // messages. By default, entrystream does nothing to the entry stream.
    19  //
    20  // Examples:
    21  //
    22  //	$ ... | entrystream                      # Passes through proto entry stream unchanged
    23  //	$ ... | entrystream --sort               # Sorts the entry stream into GraphStore order
    24  //	$ ... | entrystream --write_format=json  # Prints entry stream as JSON
    25  //	$ ... | entrystream --entrysets          # Prints combined entry sets as JSON
    26  //	$ ... | entrystream --count              # Prints the number of entries in the incoming stream
    27  //	$ ... | entrystream --read_format=json   # Reads entry stream as JSON and prints a proto stream
    28  //
    29  //	$ ... | entrystream --write_format=riegeli # Writes entry stream as a Riegeli file
    30  //	$ ... | entrystream --read_format=riegeli  # Reads the entry stream from a Riegeli file
    31  package main
    32  
    33  import (
    34  	"bufio"
    35  	"encoding/json"
    36  	"flag"
    37  	"fmt"
    38  	"io"
    39  	"os"
    40  	"strings"
    41  
    42  	"kythe.io/kythe/go/platform/delimited"
    43  	"kythe.io/kythe/go/storage/entryset"
    44  	"kythe.io/kythe/go/storage/stream"
    45  	"kythe.io/kythe/go/util/compare"
    46  	"kythe.io/kythe/go/util/disksort"
    47  	"kythe.io/kythe/go/util/flagutil"
    48  	"kythe.io/kythe/go/util/log"
    49  	"kythe.io/kythe/go/util/riegeli"
    50  
    51  	"google.golang.org/protobuf/encoding/prototext"
    52  	"google.golang.org/protobuf/proto"
    53  
    54  	spb "kythe.io/kythe/proto/storage_go_proto"
    55  )
    56  
// entrySet is the JSON record emitted by main in --entrysets mode.  It groups
// consecutive entries that share the same source, target, and edge kind, and
// collects one JSON-encoded value per entry keyed by the entry's fact name.
type entrySet struct { // TODO(schroederc): rename to avoid confusion with EntrySet proto
	Source   *spb.VName `json:"source"`
	Target   *spb.VName `json:"target,omitempty"`
	EdgeKind string     `json:"edge_kind,omitempty"`

	// Properties maps a fact name to the raw JSON produced for that entry.
	Properties map[string]json.RawMessage `json:"properties"`
}
    64  
// Accepted --{read,write}_format values.  Flag values are lower-cased in main
// before being compared against these names.  main only handles
// textprotoFormat when writing a plain entry stream; it is not a case in the
// --read_format switch.
const (
	delimitedFormat = "delimited"
	jsonFormat      = "json"
	riegeliFormat   = "riegeli"
	textprotoFormat = "textproto"
)
    72  
// Command-line flags controlling how the entry stream is read, transformed,
// and written.
var (
	// Deprecated boolean switches; when set, main overrides the corresponding
	// --{read,write}_format value with "json" and logs a warning.
	readJSON  = flag.Bool("read_json", false, "Assume stdin is a stream of JSON entries instead of protobufs (deprecated: use --read_format)")
	writeJSON = flag.Bool("write_json", false, "Print JSON stream as output (deprecated: use --write_format)")

	// Stream encodings for stdin and stdout; both default to delimitedFormat.
	readFormat  = flag.String("read_format", delimitedFormat, "Format of the input stream (accepted formats: {delimited,json,riegeli})")
	writeFormat = flag.String("write_format", delimitedFormat, "Format of the output stream (accepted formats: {delimited,json,riegeli,textproto})")

	// Passed to riegeli.ParseOptions when --write_format=riegeli.
	riegeliOptions = flag.String("riegeli_writer_options", "", "Riegeli writer options")

	// Stream transformations applied before any output is produced.
	sortStream  = flag.Bool("sort", false, "Sort entry stream into GraphStore order")
	uniqEntries = flag.Bool("unique", false, "Print only unique entries (implies --sort)")

	// Output modes; main checks them in the order --count,
	// --aggregate_entryset, --entrysets, and uses the first one that is set.
	aggregateEntrySet = flag.Bool("aggregate_entryset", false, "Output a single aggregate EntrySet proto")
	entrySets         = flag.Bool("entrysets", false, "Print Entry protos as JSON EntrySets (implies --sort and --write_format=json)")
	countOnly         = flag.Bool("count", false, "Only print the count of protos streamed")

	// Selects the structured JSON reader/encoders in main for JSON input and
	// output.
	structuredFacts = flag.Bool("structured_facts", false, "Encode and/or decode the fact_value for marked source facts")
)
    91  
    92  func init() {
    93  	flag.Usage = flagutil.SimpleUsage("Manipulate a stream of Entry messages",
    94  		"[--read_format=<format>] [--unique] ([--write_format=<format>] [--sort] | [--entrysets] | [--count] | [--aggregate_entryset])")
    95  }
    96  
    97  func main() {
    98  	flag.Parse()
    99  	if len(flag.Args()) > 0 {
   100  		flagutil.UsageErrorf("unknown arguments: %v", flag.Args())
   101  	}
   102  
   103  	// Normalize --{read,write}_format values
   104  	*readFormat = strings.ToLower(*readFormat)
   105  	*writeFormat = strings.ToLower(*writeFormat)
   106  
   107  	if *readJSON {
   108  		log.Warningf("--read_json is deprecated; use --read_format=json")
   109  		*readFormat = jsonFormat
   110  	}
   111  	if *writeJSON {
   112  		log.Warningf("--write_json is deprecated; use --write_format=json")
   113  		*writeFormat = jsonFormat
   114  	}
   115  
   116  	in := bufio.NewReaderSize(os.Stdin, 2*4096)
   117  	out := bufio.NewWriter(os.Stdout)
   118  
   119  	var rd stream.EntryReader
   120  	switch *readFormat {
   121  	case jsonFormat:
   122  		if *structuredFacts {
   123  			rd = stream.NewStructuredJSONReader(in)
   124  		} else {
   125  			rd = stream.NewJSONReader(in)
   126  		}
   127  	case riegeliFormat:
   128  		rd = func(emit func(*spb.Entry) error) error {
   129  			r := riegeli.NewReader(in)
   130  			for {
   131  				rec, err := r.Next()
   132  				if err == io.EOF {
   133  					return nil
   134  				} else if err != nil {
   135  					return err
   136  				}
   137  				var e spb.Entry
   138  				if err := proto.Unmarshal(rec, &e); err != nil {
   139  					return err
   140  				} else if err := emit(&e); err != nil {
   141  					return err
   142  				}
   143  			}
   144  		}
   145  	case delimitedFormat:
   146  		rd = stream.NewReader(in)
   147  	default:
   148  		log.Fatalf("Unsupported --read_format=%s", *readFormat)
   149  	}
   150  
   151  	if *sortStream || *entrySets || *uniqEntries {
   152  		var err error
   153  		rd, err = sortEntries(rd)
   154  		failOnErr(err)
   155  	}
   156  
   157  	if *uniqEntries {
   158  		rd = dedupEntries(rd)
   159  	}
   160  
   161  	switch {
   162  	case *countOnly:
   163  		var count int
   164  		failOnErr(rd(func(_ *spb.Entry) error {
   165  			count++
   166  			return nil
   167  		}))
   168  		fmt.Println(count)
   169  	case *aggregateEntrySet:
   170  		es := entryset.New(nil)
   171  		failOnErr(rd(es.Add))
   172  		pb := es.Encode()
   173  		switch *writeFormat {
   174  		case jsonFormat:
   175  			encoder := json.NewEncoder(out)
   176  			failOnErr(encoder.Encode(pb))
   177  		case riegeliFormat:
   178  			opts, err := riegeli.ParseOptions(*riegeliOptions)
   179  			failOnErr(err)
   180  			wr := riegeli.NewWriter(out, opts)
   181  			failOnErr(wr.PutProto(pb))
   182  			failOnErr(wr.Flush())
   183  		case delimitedFormat:
   184  			wr := delimited.NewWriter(out)
   185  			failOnErr(wr.PutProto(pb))
   186  		default:
   187  			log.Fatalf("Unsupported --write_format=%s", *writeFormat)
   188  		}
   189  	case *entrySets:
   190  		encoder := json.NewEncoder(out)
   191  		var set entrySet
   192  		failOnErr(rd(func(entry *spb.Entry) error {
   193  			if !compare.VNamesEqual(set.Source, entry.Source) || !compare.VNamesEqual(set.Target, entry.Target) || set.EdgeKind != entry.EdgeKind {
   194  				if len(set.Properties) != 0 {
   195  					if err := encoder.Encode(set); err != nil {
   196  						return err
   197  					}
   198  				}
   199  				set.Source = entry.Source
   200  				set.EdgeKind = entry.EdgeKind
   201  				set.Target = entry.Target
   202  				set.Properties = make(map[string]json.RawMessage)
   203  			}
   204  			var err error
   205  			if *structuredFacts {
   206  				set.Properties[entry.FactName], err = stream.StructuredFactValueJSON(entry)
   207  			} else {
   208  				set.Properties[entry.FactName], err = json.Marshal(entry)
   209  			}
   210  			return err
   211  		}))
   212  		if len(set.Properties) != 0 {
   213  			failOnErr(encoder.Encode(set))
   214  		}
   215  	default:
   216  		switch *writeFormat {
   217  		case jsonFormat:
   218  			encoder := json.NewEncoder(out)
   219  			failOnErr(rd(func(entry *spb.Entry) error {
   220  				if *structuredFacts {
   221  					return encoder.Encode(stream.Structured(entry))
   222  				}
   223  				return encoder.Encode(entry)
   224  			}))
   225  		case riegeliFormat:
   226  			opts, err := riegeli.ParseOptions(*riegeliOptions)
   227  			failOnErr(err)
   228  			wr := riegeli.NewWriter(out, opts)
   229  			failOnErr(rd(func(entry *spb.Entry) error {
   230  				return wr.PutProto(entry)
   231  			}))
   232  			failOnErr(wr.Flush())
   233  		case delimitedFormat:
   234  			wr := delimited.NewWriter(out)
   235  			failOnErr(rd(func(entry *spb.Entry) error {
   236  				return wr.PutProto(entry)
   237  			}))
   238  		case textprotoFormat:
   239  			entries := &spb.Entries{}
   240  			failOnErr(rd(func(entry *spb.Entry) error {
   241  				entries.Entries = append(entries.Entries, entry)
   242  				return nil
   243  			}))
   244  			out.WriteString(prototext.Format(entries))
   245  
   246  		default:
   247  			log.Fatalf("Unsupported --write_format=%s", *writeFormat)
   248  		}
   249  	}
   250  	failOnErr(out.Flush())
   251  }
   252  
   253  func sortEntries(rd stream.EntryReader) (stream.EntryReader, error) {
   254  	sorter, err := disksort.NewMergeSorter(disksort.MergeOptions{
   255  		Lesser:    entryLesser{},
   256  		Marshaler: entryMarshaler{},
   257  	})
   258  	if err != nil {
   259  		return nil, fmt.Errorf("error creating entries sorter: %v", err)
   260  	}
   261  
   262  	if err := rd(func(e *spb.Entry) error {
   263  		return sorter.Add(e)
   264  	}); err != nil {
   265  		return nil, fmt.Errorf("error sorting entries: %v", err)
   266  	}
   267  
   268  	return func(f func(*spb.Entry) error) error {
   269  		return sorter.Read(func(i any) error {
   270  			return f(i.(*spb.Entry))
   271  		})
   272  	}, nil
   273  }
   274  
   275  type entryLesser struct{}
   276  
   277  func (entryLesser) Less(a, b any) bool {
   278  	return compare.Entries(a.(*spb.Entry), b.(*spb.Entry)) == compare.LT
   279  }
   280  
   281  type entryMarshaler struct{}
   282  
   283  func (entryMarshaler) Marshal(x any) ([]byte, error) { return proto.Marshal(x.(proto.Message)) }
   284  
   285  func (entryMarshaler) Unmarshal(rec []byte) (any, error) {
   286  	var e spb.Entry
   287  	return &e, proto.Unmarshal(rec, &e)
   288  }
   289  
   290  func dedupEntries(rd stream.EntryReader) stream.EntryReader {
   291  	return func(f func(*spb.Entry) error) error {
   292  		var last *spb.Entry
   293  		return rd(func(e *spb.Entry) error {
   294  			if compare.Entries(last, e) != compare.EQ {
   295  				last = e
   296  				return f(e)
   297  			}
   298  			return nil
   299  		})
   300  	}
   301  }
   302  
   303  func failOnErr(err error) {
   304  	if err != nil {
   305  		log.Fatal(err)
   306  	}
   307  }