kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/go/serving/tools/write_tables/write_tables.go (about) 1 /* 2 * Copyright 2015 The Kythe Authors. All rights reserved. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 // Binary write_tables creates a combined xrefs/filetree/search serving table 18 // based on a given GraphStore. 19 package main 20 21 import ( 22 "context" 23 "errors" 24 "flag" 25 "time" 26 27 "kythe.io/kythe/go/platform/vfs" 28 "kythe.io/kythe/go/services/graphstore" 29 "kythe.io/kythe/go/serving/pipeline" 30 "kythe.io/kythe/go/serving/pipeline/beamio" 31 "kythe.io/kythe/go/serving/xrefs" 32 "kythe.io/kythe/go/storage/gsutil" 33 "kythe.io/kythe/go/storage/leveldb" 34 "kythe.io/kythe/go/storage/stream" 35 "kythe.io/kythe/go/util/flagutil" 36 "kythe.io/kythe/go/util/log" 37 "kythe.io/kythe/go/util/profile" 38 39 "github.com/apache/beam/sdks/go/pkg/beam" 40 "github.com/apache/beam/sdks/go/pkg/beam/transforms/stats" 41 "github.com/apache/beam/sdks/go/pkg/beam/x/beamx" 42 43 spb "kythe.io/kythe/proto/storage_go_proto" 44 45 _ "kythe.io/kythe/go/services/graphstore/proxy" 46 _ "kythe.io/third_party/beam/sdks/go/pkg/beam/runners/disksort" 47 ) 48 49 var ( 50 gs graphstore.Service 51 entriesFile = flag.String("entries", "", 52 "In non-beam mode: path to GraphStore-ordered entries file (mutually exclusive with --graphstore).\n"+ 53 "In beam mode: path to an unordered entries file, or if ending with slash, a directory containing such files.") 54 55 tablePath = flag.String("out", "", "Directory path to output serving table") 56 57 maxPageSize = flag.Int("max_page_size", 4000, 58 "If positive, edge/cross-reference pages are restricted to under this number of edges/references") 59 compressShards = flag.Bool("compress_shards", false, 60 "Determines whether intermediate data written to disk should be compressed.") 61 maxShardSize = flag.Int("max_shard_size", 32000, 62 "Maximum number of elements (edges, decoration fragments, etc.) to keep in-memory before flushing an intermediary data shard to disk.") 63 64 verbose = flag.Bool("verbose", false, "Whether to emit extra, and possibly excessive, log messages") 65 66 experimentalBeamPipeline = flag.Bool("experimental_beam_pipeline", false, "Whether to use the Beam experimental pipeline implementation") 67 beamShards = flag.Int("beam_shards", 0, "Number of shards for beam processing. If non-positive, a reasonable default will be chosen.") 68 beamK = flag.Int("beam_k", 0, "Amount of memory to use when determining level DB shards. This will keep at most approximately O(k*log(n/k)) database keys in memory at once on a single machine, where n represents the total number of keys written to the database. As this value increases, elements will be distributed more evenly between the shards. By default, this will be set to the number of shards.") 69 beamInternalSharding flagutil.IntList 70 experimentalColumnarData = flag.Bool("experimental_beam_columnar_data", false, "Whether to emit columnar data from the Beam pipeline implementation") 71 compactTable = flag.Bool("compact_table", false, "Whether to compact the output LevelDB after its creation") 72 ) 73 74 func init() { 75 flag.Var(&beamInternalSharding, "beam_internal_sharding", "Controls how database keys are sharded in memory during processing. If the beam pipeline is running out of memory, use this to increase parallelism. Can be specified repeatedly for more control over shard computation. For example, if specified with -beam_internal_sharding 16 -beam_internal_sharding 4, the beam pipeline can use up to 16 machines to compute intermediate sharding information, then up to 4, then 1 to produce the final output. If unspecified, all database keys will be combined on a single machine to compute LevelDB shards.") 76 gsutil.Flag(&gs, "graphstore", "GraphStore to read (mutually exclusive with --entries)") 77 flag.Usage = flagutil.SimpleUsage( 78 "Creates a combined xrefs/filetree/search serving table based on a given GraphStore or stream of GraphStore-ordered entries", 79 "(--graphstore spec | --entries path) --out path") 80 } 81 82 func main() { 83 flag.Parse() 84 beam.Init() 85 ctx := context.Background() 86 if *experimentalBeamPipeline { 87 if err := runExperimentalBeamPipeline(ctx); err != nil { 88 log.Fatalf("Pipeline error: %v", err) 89 } 90 if *compactTable { 91 if err := compactLevelDB(*tablePath); err != nil { 92 log.Fatalf("Error compacting LevelDB: %v", err) 93 } 94 } 95 return 96 } 97 98 if gs == nil && *entriesFile == "" { 99 flagutil.UsageError("missing --graphstore or --entries") 100 } else if gs != nil && *entriesFile != "" { 101 flagutil.UsageError("--graphstore and --entries are mutually exclusive") 102 } else if *tablePath == "" { 103 flagutil.UsageError("missing required --out flag") 104 } 105 106 db, err := leveldb.Open(*tablePath, nil) 107 if err != nil { 108 log.Fatal(err) 109 } 110 defer db.Close(ctx) 111 112 if err := profile.Start(ctx); err != nil { 113 log.Fatal(err) 114 } 115 defer profile.Stop() 116 117 var rd stream.EntryReader 118 if gs != nil { 119 rd = func(f func(e *spb.Entry) error) error { 120 defer gs.Close(ctx) 121 return gs.Scan(ctx, &spb.ScanRequest{}, f) 122 } 123 } else { 124 f, err := vfs.Open(ctx, *entriesFile) 125 if err != nil { 126 log.Fatalf("Error opening %q: %v", *entriesFile, err) 127 } 128 defer f.Close() 129 rd = stream.NewReader(f) 130 } 131 132 if err := pipeline.Run(ctx, rd, db, &pipeline.Options{ 133 Verbose: *verbose, 134 MaxPageSize: *maxPageSize, 135 CompressShards: *compressShards, 136 MaxShardSize: *maxShardSize, 137 }); err != nil { 138 log.Fatal("FATAL ERROR: ", err) 139 } 140 141 if *compactTable { 142 if err := compactLevelDB(*tablePath); err != nil { 143 log.Fatalf("Error compacting LevelDB: %v", err) 144 } 145 } 146 } 147 148 func compactLevelDB(path string) error { 149 defer func(start time.Time) { log.Infof("Compaction completed in %s", time.Since(start)) }(time.Now()) 150 return leveldb.CompactRange(*tablePath, nil) 151 } 152 153 func runExperimentalBeamPipeline(ctx context.Context) error { 154 if runnerFlag := flag.Lookup("runner"); runnerFlag.Value.String() == "direct" { 155 runnerFlag.Value.Set("disksort") 156 } 157 158 if gs != nil { 159 return errors.New("--graphstore input not supported with --experimental_beam_pipeline") 160 } else if *entriesFile == "" { 161 return errors.New("--entries file path required") 162 } else if *tablePath == "" { 163 return errors.New("--out table path required") 164 } 165 166 p, s := beam.NewPipelineWithRoot() 167 entries, err := beamio.ReadEntries(ctx, s, *entriesFile) 168 if err != nil { 169 log.Fatal("Error reading entries: ", err) 170 } 171 k := pipeline.FromEntries(s, entries) 172 shards := *beamShards 173 if shards <= 0 { 174 // TODO(schroederc): better determine number of shards 175 shards = 128 176 } 177 statsK := *beamK 178 if statsK == 0 { 179 statsK = shards 180 } 181 opts := stats.Opts{ 182 K: statsK, 183 InternalSharding: beamInternalSharding, 184 NumQuantiles: shards, 185 } 186 if *experimentalColumnarData { 187 beamio.WriteLevelDB(s, *tablePath, opts, 188 createColumnarMetadata(s), 189 k.SplitCrossReferences(), 190 k.SplitDecorations(), 191 k.CorpusRoots(), 192 k.Directories(), 193 k.Documents(), 194 k.SplitEdges(), 195 ) 196 } else { 197 edgeSets, edgePages := k.Edges() 198 xrefSets, xrefPages := k.CrossReferences() 199 beamio.WriteLevelDB(s, *tablePath, opts, 200 k.CorpusRoots(), 201 k.Decorations(), 202 k.Directories(), 203 k.Documents(), 204 xrefSets, xrefPages, 205 edgeSets, edgePages, 206 ) 207 } 208 209 return beamx.Run(ctx, p) 210 } 211 212 func init() { 213 beam.RegisterFunction(emitColumnarMetadata) 214 } 215 216 func createColumnarMetadata(s beam.Scope) beam.PCollection { 217 return beam.ParDo(s, emitColumnarMetadata, beam.Impulse(s)) 218 } 219 220 func emitColumnarMetadata(_ []byte) (string, string) { return xrefs.ColumnarTableKeyMarker, "v1" }