kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/go/serving/tools/write_tables/write_tables.go (about)

     1  /*
     2   * Copyright 2015 The Kythe Authors. All rights reserved.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *   http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  // Binary write_tables creates a combined xrefs/filetree/search serving table
    18  // based on a given GraphStore.
    19  package main
    20  
import (
	"context"
	"errors"
	"flag"
	"fmt"
	"time"

	"kythe.io/kythe/go/platform/vfs"
	"kythe.io/kythe/go/services/graphstore"
	"kythe.io/kythe/go/serving/pipeline"
	"kythe.io/kythe/go/serving/pipeline/beamio"
	"kythe.io/kythe/go/serving/xrefs"
	"kythe.io/kythe/go/storage/gsutil"
	"kythe.io/kythe/go/storage/leveldb"
	"kythe.io/kythe/go/storage/stream"
	"kythe.io/kythe/go/util/flagutil"
	"kythe.io/kythe/go/util/log"
	"kythe.io/kythe/go/util/profile"

	"github.com/apache/beam/sdks/go/pkg/beam"
	"github.com/apache/beam/sdks/go/pkg/beam/transforms/stats"
	"github.com/apache/beam/sdks/go/pkg/beam/x/beamx"

	spb "kythe.io/kythe/proto/storage_go_proto"

	_ "kythe.io/kythe/go/services/graphstore/proxy"
	_ "kythe.io/third_party/beam/sdks/go/pkg/beam/runners/disksort"
)
    48  
    49  var (
    50  	gs          graphstore.Service
    51  	entriesFile = flag.String("entries", "",
    52  		"In non-beam mode: path to GraphStore-ordered entries file (mutually exclusive with --graphstore).\n"+
    53  			"In beam mode: path to an unordered entries file, or if ending with slash, a directory containing such files.")
    54  
    55  	tablePath = flag.String("out", "", "Directory path to output serving table")
    56  
    57  	maxPageSize = flag.Int("max_page_size", 4000,
    58  		"If positive, edge/cross-reference pages are restricted to under this number of edges/references")
    59  	compressShards = flag.Bool("compress_shards", false,
    60  		"Determines whether intermediate data written to disk should be compressed.")
    61  	maxShardSize = flag.Int("max_shard_size", 32000,
    62  		"Maximum number of elements (edges, decoration fragments, etc.) to keep in-memory before flushing an intermediary data shard to disk.")
    63  
    64  	verbose = flag.Bool("verbose", false, "Whether to emit extra, and possibly excessive, log messages")
    65  
    66  	experimentalBeamPipeline = flag.Bool("experimental_beam_pipeline", false, "Whether to use the Beam experimental pipeline implementation")
    67  	beamShards               = flag.Int("beam_shards", 0, "Number of shards for beam processing. If non-positive, a reasonable default will be chosen.")
    68  	beamK                    = flag.Int("beam_k", 0, "Amount of memory to use when determining level DB shards. This will keep at most approximately O(k*log(n/k)) database keys in memory at once on a single machine, where n represents the total number of keys written to the database. As this value increases, elements will be distributed more evenly between the shards. By default, this will be set to the number of shards.")
    69  	beamInternalSharding     flagutil.IntList
    70  	experimentalColumnarData = flag.Bool("experimental_beam_columnar_data", false, "Whether to emit columnar data from the Beam pipeline implementation")
    71  	compactTable             = flag.Bool("compact_table", false, "Whether to compact the output LevelDB after its creation")
    72  )
    73  
    74  func init() {
    75  	flag.Var(&beamInternalSharding, "beam_internal_sharding", "Controls how database keys are sharded in memory during processing. If the beam pipeline is running out of memory, use this to increase parallelism. Can be specified repeatedly for more control over shard computation. For example, if specified with -beam_internal_sharding 16 -beam_internal_sharding 4, the beam pipeline can use up to 16 machines to compute intermediate sharding information, then up to 4, then 1 to produce the final output. If unspecified, all database keys will be combined on a single machine to compute LevelDB shards.")
    76  	gsutil.Flag(&gs, "graphstore", "GraphStore to read (mutually exclusive with --entries)")
    77  	flag.Usage = flagutil.SimpleUsage(
    78  		"Creates a combined xrefs/filetree/search serving table based on a given GraphStore or stream of GraphStore-ordered entries",
    79  		"(--graphstore spec | --entries path) --out path")
    80  }
    81  
    82  func main() {
    83  	flag.Parse()
    84  	beam.Init()
    85  	ctx := context.Background()
    86  	if *experimentalBeamPipeline {
    87  		if err := runExperimentalBeamPipeline(ctx); err != nil {
    88  			log.Fatalf("Pipeline error: %v", err)
    89  		}
    90  		if *compactTable {
    91  			if err := compactLevelDB(*tablePath); err != nil {
    92  				log.Fatalf("Error compacting LevelDB: %v", err)
    93  			}
    94  		}
    95  		return
    96  	}
    97  
    98  	if gs == nil && *entriesFile == "" {
    99  		flagutil.UsageError("missing --graphstore or --entries")
   100  	} else if gs != nil && *entriesFile != "" {
   101  		flagutil.UsageError("--graphstore and --entries are mutually exclusive")
   102  	} else if *tablePath == "" {
   103  		flagutil.UsageError("missing required --out flag")
   104  	}
   105  
   106  	db, err := leveldb.Open(*tablePath, nil)
   107  	if err != nil {
   108  		log.Fatal(err)
   109  	}
   110  	defer db.Close(ctx)
   111  
   112  	if err := profile.Start(ctx); err != nil {
   113  		log.Fatal(err)
   114  	}
   115  	defer profile.Stop()
   116  
   117  	var rd stream.EntryReader
   118  	if gs != nil {
   119  		rd = func(f func(e *spb.Entry) error) error {
   120  			defer gs.Close(ctx)
   121  			return gs.Scan(ctx, &spb.ScanRequest{}, f)
   122  		}
   123  	} else {
   124  		f, err := vfs.Open(ctx, *entriesFile)
   125  		if err != nil {
   126  			log.Fatalf("Error opening %q: %v", *entriesFile, err)
   127  		}
   128  		defer f.Close()
   129  		rd = stream.NewReader(f)
   130  	}
   131  
   132  	if err := pipeline.Run(ctx, rd, db, &pipeline.Options{
   133  		Verbose:        *verbose,
   134  		MaxPageSize:    *maxPageSize,
   135  		CompressShards: *compressShards,
   136  		MaxShardSize:   *maxShardSize,
   137  	}); err != nil {
   138  		log.Fatal("FATAL ERROR: ", err)
   139  	}
   140  
   141  	if *compactTable {
   142  		if err := compactLevelDB(*tablePath); err != nil {
   143  			log.Fatalf("Error compacting LevelDB: %v", err)
   144  		}
   145  	}
   146  }
   147  
   148  func compactLevelDB(path string) error {
   149  	defer func(start time.Time) { log.Infof("Compaction completed in %s", time.Since(start)) }(time.Now())
   150  	return leveldb.CompactRange(*tablePath, nil)
   151  }
   152  
   153  func runExperimentalBeamPipeline(ctx context.Context) error {
   154  	if runnerFlag := flag.Lookup("runner"); runnerFlag.Value.String() == "direct" {
   155  		runnerFlag.Value.Set("disksort")
   156  	}
   157  
   158  	if gs != nil {
   159  		return errors.New("--graphstore input not supported with --experimental_beam_pipeline")
   160  	} else if *entriesFile == "" {
   161  		return errors.New("--entries file path required")
   162  	} else if *tablePath == "" {
   163  		return errors.New("--out table path required")
   164  	}
   165  
   166  	p, s := beam.NewPipelineWithRoot()
   167  	entries, err := beamio.ReadEntries(ctx, s, *entriesFile)
   168  	if err != nil {
   169  		log.Fatal("Error reading entries: ", err)
   170  	}
   171  	k := pipeline.FromEntries(s, entries)
   172  	shards := *beamShards
   173  	if shards <= 0 {
   174  		// TODO(schroederc): better determine number of shards
   175  		shards = 128
   176  	}
   177  	statsK := *beamK
   178  	if statsK == 0 {
   179  		statsK = shards
   180  	}
   181  	opts := stats.Opts{
   182  		K:                statsK,
   183  		InternalSharding: beamInternalSharding,
   184  		NumQuantiles:     shards,
   185  	}
   186  	if *experimentalColumnarData {
   187  		beamio.WriteLevelDB(s, *tablePath, opts,
   188  			createColumnarMetadata(s),
   189  			k.SplitCrossReferences(),
   190  			k.SplitDecorations(),
   191  			k.CorpusRoots(),
   192  			k.Directories(),
   193  			k.Documents(),
   194  			k.SplitEdges(),
   195  		)
   196  	} else {
   197  		edgeSets, edgePages := k.Edges()
   198  		xrefSets, xrefPages := k.CrossReferences()
   199  		beamio.WriteLevelDB(s, *tablePath, opts,
   200  			k.CorpusRoots(),
   201  			k.Decorations(),
   202  			k.Directories(),
   203  			k.Documents(),
   204  			xrefSets, xrefPages,
   205  			edgeSets, edgePages,
   206  		)
   207  	}
   208  
   209  	return beamx.Run(ctx, p)
   210  }
   211  
// init registers emitColumnarMetadata with Beam; createColumnarMetadata passes
// that function to beam.ParDo.
func init() {
	beam.RegisterFunction(emitColumnarMetadata)
}
   215  
   216  func createColumnarMetadata(s beam.Scope) beam.PCollection {
   217  	return beam.ParDo(s, emitColumnarMetadata, beam.Impulse(s))
   218  }
   219  
   220  func emitColumnarMetadata(_ []byte) (string, string) { return xrefs.ColumnarTableKeyMarker, "v1" }