kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/go/platform/tools/entrystream/entrystream.go (about) 1 /* 2 * Copyright 2014 The Kythe Authors. All rights reserved. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 // Binary entrystream provides tools to manipulate a stream of delimited Entry 18 // messages. By default, entrystream does nothing to the entry stream. 19 // 20 // Examples: 21 // 22 // $ ... | entrystream # Passes through proto entry stream unchanged 23 // $ ... | entrystream --sort # Sorts the entry stream into GraphStore order 24 // $ ... | entrystream --write_format=json # Prints entry stream as JSON 25 // $ ... | entrystream --entrysets # Prints combined entry sets as JSON 26 // $ ... | entrystream --count # Prints the number of entries in the incoming stream 27 // $ ... | entrystream --read_format=json # Reads entry stream as JSON and prints a proto stream 28 // 29 // $ ... | entrystream --write_format=riegeli # Writes entry stream as a Riegeli file 30 // $ ... | entrystream --read_format=riegeli # Reads the entry stream from a Riegeli file 31 package main 32 33 import ( 34 "bufio" 35 "encoding/json" 36 "flag" 37 "fmt" 38 "io" 39 "os" 40 "strings" 41 42 "kythe.io/kythe/go/platform/delimited" 43 "kythe.io/kythe/go/storage/entryset" 44 "kythe.io/kythe/go/storage/stream" 45 "kythe.io/kythe/go/util/compare" 46 "kythe.io/kythe/go/util/disksort" 47 "kythe.io/kythe/go/util/flagutil" 48 "kythe.io/kythe/go/util/log" 49 "kythe.io/kythe/go/util/riegeli" 50 51 "google.golang.org/protobuf/encoding/prototext" 52 "google.golang.org/protobuf/proto" 53 54 spb "kythe.io/kythe/proto/storage_go_proto" 55 ) 56 57 type entrySet struct { // TODO(schroederc): rename to avoid confusion with EntrySet proto 58 Source *spb.VName `json:"source"` 59 Target *spb.VName `json:"target,omitempty"` 60 EdgeKind string `json:"edge_kind,omitempty"` 61 62 Properties map[string]json.RawMessage `json:"properties"` 63 } 64 65 // Accepted --{read,write}_format values 66 const ( 67 delimitedFormat = "delimited" 68 jsonFormat = "json" 69 riegeliFormat = "riegeli" 70 textprotoFormat = "textproto" 71 ) 72 73 var ( 74 readJSON = flag.Bool("read_json", false, "Assume stdin is a stream of JSON entries instead of protobufs (deprecated: use --read_format)") 75 writeJSON = flag.Bool("write_json", false, "Print JSON stream as output (deprecated: use --write_format)") 76 77 readFormat = flag.String("read_format", delimitedFormat, "Format of the input stream (accepted formats: {delimited,json,riegeli})") 78 writeFormat = flag.String("write_format", delimitedFormat, "Format of the output stream (accepted formats: {delimited,json,riegeli,textproto})") 79 80 riegeliOptions = flag.String("riegeli_writer_options", "", "Riegeli writer options") 81 82 sortStream = flag.Bool("sort", false, "Sort entry stream into GraphStore order") 83 uniqEntries = flag.Bool("unique", false, "Print only unique entries (implies --sort)") 84 85 aggregateEntrySet = flag.Bool("aggregate_entryset", false, "Output a single aggregate EntrySet proto") 86 entrySets = flag.Bool("entrysets", false, "Print Entry protos as JSON EntrySets (implies --sort and --write_format=json)") 87 countOnly = flag.Bool("count", false, "Only print the count of protos streamed") 88 89 structuredFacts = flag.Bool("structured_facts", false, "Encode and/or decode the fact_value for marked source facts") 90 ) 91 92 func init() { 93 flag.Usage = flagutil.SimpleUsage("Manipulate a stream of Entry messages", 94 "[--read_format=<format>] [--unique] ([--write_format=<format>] [--sort] | [--entrysets] | [--count] | [--aggregate_entryset])") 95 } 96 97 func main() { 98 flag.Parse() 99 if len(flag.Args()) > 0 { 100 flagutil.UsageErrorf("unknown arguments: %v", flag.Args()) 101 } 102 103 // Normalize --{read,write}_format values 104 *readFormat = strings.ToLower(*readFormat) 105 *writeFormat = strings.ToLower(*writeFormat) 106 107 if *readJSON { 108 log.Warningf("--read_json is deprecated; use --read_format=json") 109 *readFormat = jsonFormat 110 } 111 if *writeJSON { 112 log.Warningf("--write_json is deprecated; use --write_format=json") 113 *writeFormat = jsonFormat 114 } 115 116 in := bufio.NewReaderSize(os.Stdin, 2*4096) 117 out := bufio.NewWriter(os.Stdout) 118 119 var rd stream.EntryReader 120 switch *readFormat { 121 case jsonFormat: 122 if *structuredFacts { 123 rd = stream.NewStructuredJSONReader(in) 124 } else { 125 rd = stream.NewJSONReader(in) 126 } 127 case riegeliFormat: 128 rd = func(emit func(*spb.Entry) error) error { 129 r := riegeli.NewReader(in) 130 for { 131 rec, err := r.Next() 132 if err == io.EOF { 133 return nil 134 } else if err != nil { 135 return err 136 } 137 var e spb.Entry 138 if err := proto.Unmarshal(rec, &e); err != nil { 139 return err 140 } else if err := emit(&e); err != nil { 141 return err 142 } 143 } 144 } 145 case delimitedFormat: 146 rd = stream.NewReader(in) 147 default: 148 log.Fatalf("Unsupported --read_format=%s", *readFormat) 149 } 150 151 if *sortStream || *entrySets || *uniqEntries { 152 var err error 153 rd, err = sortEntries(rd) 154 failOnErr(err) 155 } 156 157 if *uniqEntries { 158 rd = dedupEntries(rd) 159 } 160 161 switch { 162 case *countOnly: 163 var count int 164 failOnErr(rd(func(_ *spb.Entry) error { 165 count++ 166 return nil 167 })) 168 fmt.Println(count) 169 case *aggregateEntrySet: 170 es := entryset.New(nil) 171 failOnErr(rd(es.Add)) 172 pb := es.Encode() 173 switch *writeFormat { 174 case jsonFormat: 175 encoder := json.NewEncoder(out) 176 failOnErr(encoder.Encode(pb)) 177 case riegeliFormat: 178 opts, err := riegeli.ParseOptions(*riegeliOptions) 179 failOnErr(err) 180 wr := riegeli.NewWriter(out, opts) 181 failOnErr(wr.PutProto(pb)) 182 failOnErr(wr.Flush()) 183 case delimitedFormat: 184 wr := delimited.NewWriter(out) 185 failOnErr(wr.PutProto(pb)) 186 default: 187 log.Fatalf("Unsupported --write_format=%s", *writeFormat) 188 } 189 case *entrySets: 190 encoder := json.NewEncoder(out) 191 var set entrySet 192 failOnErr(rd(func(entry *spb.Entry) error { 193 if !compare.VNamesEqual(set.Source, entry.Source) || !compare.VNamesEqual(set.Target, entry.Target) || set.EdgeKind != entry.EdgeKind { 194 if len(set.Properties) != 0 { 195 if err := encoder.Encode(set); err != nil { 196 return err 197 } 198 } 199 set.Source = entry.Source 200 set.EdgeKind = entry.EdgeKind 201 set.Target = entry.Target 202 set.Properties = make(map[string]json.RawMessage) 203 } 204 var err error 205 if *structuredFacts { 206 set.Properties[entry.FactName], err = stream.StructuredFactValueJSON(entry) 207 } else { 208 set.Properties[entry.FactName], err = json.Marshal(entry) 209 } 210 return err 211 })) 212 if len(set.Properties) != 0 { 213 failOnErr(encoder.Encode(set)) 214 } 215 default: 216 switch *writeFormat { 217 case jsonFormat: 218 encoder := json.NewEncoder(out) 219 failOnErr(rd(func(entry *spb.Entry) error { 220 if *structuredFacts { 221 return encoder.Encode(stream.Structured(entry)) 222 } 223 return encoder.Encode(entry) 224 })) 225 case riegeliFormat: 226 opts, err := riegeli.ParseOptions(*riegeliOptions) 227 failOnErr(err) 228 wr := riegeli.NewWriter(out, opts) 229 failOnErr(rd(func(entry *spb.Entry) error { 230 return wr.PutProto(entry) 231 })) 232 failOnErr(wr.Flush()) 233 case delimitedFormat: 234 wr := delimited.NewWriter(out) 235 failOnErr(rd(func(entry *spb.Entry) error { 236 return wr.PutProto(entry) 237 })) 238 case textprotoFormat: 239 entries := &spb.Entries{} 240 failOnErr(rd(func(entry *spb.Entry) error { 241 entries.Entries = append(entries.Entries, entry) 242 return nil 243 })) 244 out.WriteString(prototext.Format(entries)) 245 246 default: 247 log.Fatalf("Unsupported --write_format=%s", *writeFormat) 248 } 249 } 250 failOnErr(out.Flush()) 251 } 252 253 func sortEntries(rd stream.EntryReader) (stream.EntryReader, error) { 254 sorter, err := disksort.NewMergeSorter(disksort.MergeOptions{ 255 Lesser: entryLesser{}, 256 Marshaler: entryMarshaler{}, 257 }) 258 if err != nil { 259 return nil, fmt.Errorf("error creating entries sorter: %v", err) 260 } 261 262 if err := rd(func(e *spb.Entry) error { 263 return sorter.Add(e) 264 }); err != nil { 265 return nil, fmt.Errorf("error sorting entries: %v", err) 266 } 267 268 return func(f func(*spb.Entry) error) error { 269 return sorter.Read(func(i any) error { 270 return f(i.(*spb.Entry)) 271 }) 272 }, nil 273 } 274 275 type entryLesser struct{} 276 277 func (entryLesser) Less(a, b any) bool { 278 return compare.Entries(a.(*spb.Entry), b.(*spb.Entry)) == compare.LT 279 } 280 281 type entryMarshaler struct{} 282 283 func (entryMarshaler) Marshal(x any) ([]byte, error) { return proto.Marshal(x.(proto.Message)) } 284 285 func (entryMarshaler) Unmarshal(rec []byte) (any, error) { 286 var e spb.Entry 287 return &e, proto.Unmarshal(rec, &e) 288 } 289 290 func dedupEntries(rd stream.EntryReader) stream.EntryReader { 291 return func(f func(*spb.Entry) error) error { 292 var last *spb.Entry 293 return rd(func(e *spb.Entry) error { 294 if compare.Entries(last, e) != compare.EQ { 295 last = e 296 return f(e) 297 } 298 return nil 299 }) 300 } 301 } 302 303 func failOnErr(err error) { 304 if err != nil { 305 log.Fatal(err) 306 } 307 }