github.com/dshekhar95/sub_dgraph@v0.0.0-20230424164411-6be28e40bbf1/dgraph/cmd/bulk/run.go

/*
 * Copyright 2017-2022 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package bulk

import (
	"encoding/json"
	"fmt"
	"log"
	"math"
	"net/http"
	_ "net/http/pprof" // http profiler
	"os"
	"path/filepath"
	"runtime"
	"strconv"
	"strings"

	"github.com/spf13/cobra"

	"github.com/dgraph-io/badger/v3"
	"github.com/dgraph-io/dgraph/ee"
	"github.com/dgraph-io/dgraph/filestore"
	"github.com/dgraph-io/dgraph/protos/pb"
	"github.com/dgraph-io/dgraph/tok"
	"github.com/dgraph-io/dgraph/worker"
	"github.com/dgraph-io/dgraph/x"
	"github.com/dgraph-io/ristretto/z"
)

// Bulk is the sub-command invoked when running "dgraph bulk".
var Bulk x.SubCommand

var defaultOutDir = "./out"

const BulkBadgerDefaults = "compression=snappy; numgoroutines=8;"

func init() {
	Bulk.Cmd = &cobra.Command{
		Use:   "bulk",
		Short: "Run Dgraph Bulk Loader",
		Run: func(cmd *cobra.Command, args []string) {
			defer x.StartProfile(Bulk.Conf).Stop()
			run()
		},
		Annotations: map[string]string{"group": "data-load"},
	}
	Bulk.Cmd.SetHelpTemplate(x.NonRootTemplate)
	Bulk.EnvPrefix = "DGRAPH_BULK"

	flag := Bulk.Cmd.Flags()
	flag.StringP("files", "f", "",
		"Location of *.rdf(.gz) or *.json(.gz) file(s) to load.")
	flag.StringP("schema", "s", "",
		"Location of schema file.")
	flag.StringP("graphql_schema", "g", "", "Location of the GraphQL schema file.")
	flag.String("format", "",
		"Specify file format (rdf or json) instead of getting it from filename.")
	flag.Bool("encrypted", false,
		"Flag to indicate whether schema and data files are encrypted. "+
			"Must be specified with --encryption or vault option(s).")
	flag.Bool("encrypted_out", false,
		"Flag to indicate whether to encrypt the output. "+
			"Must be specified with --encryption or vault option(s).")
	flag.String("out", defaultOutDir,
		"Location to write the final dgraph data directories.")
	flag.Bool("replace_out", false,
		"Replace out directory and its contents if it exists.")
	flag.String("tmp", "tmp",
		"Temp directory used for on-disk scratch space. Requires free space proportional"+
			" to the size of the RDF file and the amount of indexing used.")

	flag.IntP("num_go_routines", "j", int(math.Ceil(float64(runtime.NumCPU())/4.0)),
		"Number of worker threads to use. MORE THREADS LEAD TO HIGHER RAM USAGE.")
	flag.Int64("mapoutput_mb", 2048,
		"The estimated size of each map file output. Increasing this increases memory usage.")
	flag.Int64("partition_mb", 4, "Pick a partition key every N megabytes of data.")
	flag.Bool("skip_map_phase", false,
		"Skip the map phase (assumes that map output files already exist).")
	flag.Bool("cleanup_tmp", true,
		"Clean up the tmp directory after the loader finishes. Setting this to false allows the"+
			" bulk loader to be re-run while skipping the map phase.")
	flag.Int("reducers", 1,
		"Number of reducers to run concurrently. Increasing this can improve performance, and "+
			"must be less than or equal to the number of reduce shards.")
	flag.Bool("version", false, "Prints the version of Dgraph Bulk Loader.")
	flag.Bool("store_xids", false, "Generate an xid edge for each node.")
	flag.StringP("zero", "z", "localhost:5080", "gRPC address for Dgraph zero")
	flag.String("xidmap", "", "Directory to store xid to uid mapping")
	// TODO: Potentially move http server to main.
	flag.String("http", "localhost:8080",
		"Address to serve http (pprof).")
	flag.Bool("ignore_errors", false, "ignore line parsing errors in rdf files")
	flag.Int("map_shards", 1,
		"Number of map output shards. Must be greater than or equal to the number of reduce "+
			"shards. Increasing allows more evenly sized reduce shards, at the expense of "+
			"increased memory usage.")
	flag.Int("reduce_shards", 1,
		"Number of reduce shards. This determines the number of dgraph instances in the final "+
			"cluster. Increasing this potentially decreases the reduce stage runtime by using "+
			"more parallelism, but increases memory usage.")
	flag.String("custom_tokenizers", "",
		"Comma separated list of tokenizer plugins")
	flag.Bool("new_uids", false,
		"Ignore UIDs in load files and assign new ones.")
	flag.Uint64("force-namespace", math.MaxUint64,
		"Namespace onto which to load the data. If not set, will preserve the namespace."+
			" When using this flag to load data into a specific namespace, make sure that the "+
			"load data does not contain ACL data.")

	flag.String("badger", BulkBadgerDefaults, z.NewSuperFlagHelp(BulkBadgerDefaults).
		Head("Badger options (Refer to badger documentation for all possible options)").
		Flag("compression",
			"Specifies the compression algorithm and compression level (if applicable) for the "+
				`postings directory. "none" would disable compression, while "zstd:1" would set `+
				"zstd compression at level 1.").
		Flag("numgoroutines",
			"The number of goroutines to use in badger.Stream.").
		String())

	x.RegisterClientTLSFlags(flag)
	// Encryption and Vault options
	ee.RegisterEncFlag(flag)
}

func run() {
	cacheSize := 64 << 20 // These are the default values. User can overwrite them using --badger.
	cacheDefaults := fmt.Sprintf("indexcachesize=%d; blockcachesize=%d; ",
		(70*cacheSize)/100, (30*cacheSize)/100)

	bopts := badger.DefaultOptions("").FromSuperFlag(BulkBadgerDefaults + cacheDefaults).
		FromSuperFlag(Bulk.Conf.GetString("badger"))
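	// Added commentary (not in the original source): the two FromSuperFlag calls
	// above layer the Badger options. The hard-coded BulkBadgerDefaults and the
	// computed cache sizes are applied first, and the user-supplied --badger
	// superflag is applied second, so any option named in --badger overrides
	// those defaults while unnamed options keep their default values.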
	keys, err := ee.GetKeys(Bulk.Conf)
	x.Check(err)

	opt := options{
		DataFiles:        Bulk.Conf.GetString("files"),
		DataFormat:       Bulk.Conf.GetString("format"),
		EncryptionKey:    keys.EncKey,
		SchemaFile:       Bulk.Conf.GetString("schema"),
		GqlSchemaFile:    Bulk.Conf.GetString("graphql_schema"),
		Encrypted:        Bulk.Conf.GetBool("encrypted"),
		EncryptedOut:     Bulk.Conf.GetBool("encrypted_out"),
		OutDir:           Bulk.Conf.GetString("out"),
		ReplaceOutDir:    Bulk.Conf.GetBool("replace_out"),
		TmpDir:           Bulk.Conf.GetString("tmp"),
		NumGoroutines:    Bulk.Conf.GetInt("num_go_routines"),
		MapBufSize:       uint64(Bulk.Conf.GetInt("mapoutput_mb")),
		PartitionBufSize: int64(Bulk.Conf.GetInt("partition_mb")),
		SkipMapPhase:     Bulk.Conf.GetBool("skip_map_phase"),
		CleanupTmp:       Bulk.Conf.GetBool("cleanup_tmp"),
		NumReducers:      Bulk.Conf.GetInt("reducers"),
		Version:          Bulk.Conf.GetBool("version"),
		StoreXids:        Bulk.Conf.GetBool("store_xids"),
		ZeroAddr:         Bulk.Conf.GetString("zero"),
		HttpAddr:         Bulk.Conf.GetString("http"),
		IgnoreErrors:     Bulk.Conf.GetBool("ignore_errors"),
		MapShards:        Bulk.Conf.GetInt("map_shards"),
		ReduceShards:     Bulk.Conf.GetInt("reduce_shards"),
		CustomTokenizers: Bulk.Conf.GetString("custom_tokenizers"),
		NewUids:          Bulk.Conf.GetBool("new_uids"),
		ClientDir:        Bulk.Conf.GetString("xidmap"),
		Namespace:        Bulk.Conf.GetUint64("force-namespace"),
		Badger:           bopts,
	}

	x.PrintVersion()
	if opt.Version {
		os.Exit(0)
	}

	if len(opt.EncryptionKey) == 0 {
		if opt.Encrypted || opt.EncryptedOut {
			fmt.Fprint(os.Stderr, "Must use --encryption or vault option(s).\n")
			os.Exit(1)
		}
	} else {
		requiredFlags := Bulk.Cmd.Flags().Changed("encrypted") &&
			Bulk.Cmd.Flags().Changed("encrypted_out")
		if !requiredFlags {
			fmt.Fprint(os.Stderr,
				"Must specify --encrypted and --encrypted_out when providing encryption key.\n")
			os.Exit(1)
		}
		if !opt.Encrypted && !opt.EncryptedOut {
			fmt.Fprint(os.Stderr,
				"Must set --encrypted and/or --encrypted_out to true when providing encryption key.\n")
			os.Exit(1)
		}

		tlsConf, err := x.LoadClientTLSConfigForInternalPort(Bulk.Conf)
		x.Check(err)
		// Need to set zero addr in WorkerConfig before checking the license.
		x.WorkerConfig.ZeroAddr = []string{opt.ZeroAddr}
		x.WorkerConfig.TLSClientConfig = tlsConf
		if !worker.EnterpriseEnabled() {
			// Crash since the enterprise license is not enabled.
			log.Fatal("Enterprise License needed for the Encryption feature.")
		} else {
			log.Printf("Encryption feature enabled.")
		}
	}
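	// Added commentary (not in the original source): the checks above mean that
	// supplying an encryption key (via the --encryption or Vault options) requires
	// both --encrypted and --encrypted_out to be set explicitly, with at least one
	// of them true, and requires an enterprise-enabled Zero; otherwise the loader
	// exits before doing any work.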
	fmt.Printf("Encrypted input: %v; Encrypted output: %v\n", opt.Encrypted, opt.EncryptedOut)

	if opt.SchemaFile == "" {
		fmt.Fprint(os.Stderr, "Schema file must be specified.\n")
		os.Exit(1)
	}
	if !filestore.Exists(opt.SchemaFile) {
		fmt.Fprintf(os.Stderr, "Schema path(%v) does not exist.\n", opt.SchemaFile)
		os.Exit(1)
	}
	if opt.DataFiles == "" {
		fmt.Fprint(os.Stderr, "RDF or JSON file(s) location must be specified.\n")
		os.Exit(1)
	} else {
		fileList := strings.Split(opt.DataFiles, ",")
		for _, file := range fileList {
			if !filestore.Exists(file) {
				fmt.Fprintf(os.Stderr, "Data path(%v) does not exist.\n", file)
				os.Exit(1)
			}
		}
	}

	if opt.ReduceShards > opt.MapShards {
		fmt.Fprintf(os.Stderr, "Invalid flags: reduce_shards(%d) should be <= map_shards(%d)\n",
			opt.ReduceShards, opt.MapShards)
		os.Exit(1)
	}
	if opt.NumReducers > opt.ReduceShards {
		fmt.Fprintf(os.Stderr, "Invalid flags: reducers(%d) should be <= reduce_shards(%d)\n",
			opt.NumReducers, opt.ReduceShards)
		os.Exit(1)
	}
	if opt.CustomTokenizers != "" {
		for _, soFile := range strings.Split(opt.CustomTokenizers, ",") {
			tok.LoadCustomTokenizer(soFile)
		}
	}
	if opt.MapBufSize <= 0 || opt.PartitionBufSize <= 0 {
		fmt.Fprintf(os.Stderr, "mapoutput_mb: %d and partition_mb: %d must be greater than zero\n",
			opt.MapBufSize, opt.PartitionBufSize)
		os.Exit(1)
	}

	opt.MapBufSize <<= 20       // Convert from MB to B.
	opt.PartitionBufSize <<= 20 // Convert from MB to B.

	optBuf, err := json.MarshalIndent(&opt, "", "\t")
	x.Check(err)
	fmt.Println(string(optBuf))

	maxOpenFilesWarning()

	// Register the jemalloc handler before the debug HTTP server starts serving.
	http.HandleFunc("/jemalloc", x.JemallocHandler)
	go func() {
		log.Fatal(http.ListenAndServe(opt.HttpAddr, nil))
	}()

	// Make sure it's OK to create or replace the directory specified with the --out option.
	// It is always OK to create or replace the default output directory.
	if opt.OutDir != defaultOutDir && !opt.ReplaceOutDir {
		err := x.IsMissingOrEmptyDir(opt.OutDir)
		if err == nil {
			fmt.Fprintf(os.Stderr, "Output directory exists and is not empty."+
				" Use --replace_out to overwrite it.\n")
			os.Exit(1)
		} else if err != x.ErrMissingDir {
			x.CheckfNoTrace(err)
		}
	}

	// Delete and recreate the output dirs to ensure they are empty.
	x.Check(os.RemoveAll(opt.OutDir))
	for i := 0; i < opt.ReduceShards; i++ {
		dir := filepath.Join(opt.OutDir, strconv.Itoa(i), "p")
		x.Check(os.MkdirAll(dir, 0700))
		opt.shardOutputDirs = append(opt.shardOutputDirs, dir)

		x.Check(x.WriteGroupIdFile(dir, uint32(i+1)))
	}
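	// Added commentary (not in the original source): each reduce shard i gets its
	// own posting directory at <out>/<i>/p, tagged with group ID i+1 via the
	// group_id file written above. Per the reduce_shards flag description, each of
	// these directories backs one dgraph instance (group) in the final cluster.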
	// Create a directory just for bulk loader's usage.
	if !opt.SkipMapPhase {
		x.Check(os.RemoveAll(opt.TmpDir))
		x.Check(os.MkdirAll(opt.TmpDir, 0700))
	}
	if opt.CleanupTmp {
		defer os.RemoveAll(opt.TmpDir)
	}

	// Create directory for temporary buffers used in map-reduce phase
	bufDir := filepath.Join(opt.TmpDir, bufferDir)
	x.Check(os.RemoveAll(bufDir))
	x.Check(os.MkdirAll(bufDir, 0700))
	defer os.RemoveAll(bufDir)

	loader := newLoader(&opt)

	const bulkMetaFilename = "bulk.meta"
	bulkMetaPath := filepath.Join(opt.TmpDir, bulkMetaFilename)

	if opt.SkipMapPhase {
		bulkMetaData, err := os.ReadFile(bulkMetaPath)
		if err != nil {
			fmt.Fprintln(os.Stderr, "Error reading from bulk meta file")
			os.Exit(1)
		}

		var bulkMeta pb.BulkMeta
		if err = bulkMeta.Unmarshal(bulkMetaData); err != nil {
			fmt.Fprintln(os.Stderr, "Error deserializing bulk meta file")
			os.Exit(1)
		}

		loader.prog.mapEdgeCount = bulkMeta.EdgeCount
		loader.schema.schemaMap = bulkMeta.SchemaMap
		loader.schema.types = bulkMeta.Types
	} else {
		loader.mapStage()
		mergeMapShardsIntoReduceShards(&opt)
		loader.leaseNamespaces()

		bulkMeta := pb.BulkMeta{
			EdgeCount: loader.prog.mapEdgeCount,
			SchemaMap: loader.schema.schemaMap,
			Types:     loader.schema.types,
		}
		bulkMetaData, err := bulkMeta.Marshal()
		if err != nil {
			fmt.Fprintln(os.Stderr, "Error serializing bulk meta file")
			os.Exit(1)
		}
		if err = os.WriteFile(bulkMetaPath, bulkMetaData, 0600); err != nil {
			fmt.Fprintln(os.Stderr, "Error writing to bulk meta file")
			os.Exit(1)
		}
	}
	loader.reduceStage()
	loader.writeSchema()
	loader.cleanup()
}

func maxOpenFilesWarning() {
	const (
		red    = "\x1b[31m"
		green  = "\x1b[32m"
		yellow = "\x1b[33m"
		reset  = "\x1b[0m"
	)
	maxOpenFiles, err := x.QueryMaxOpenFiles()
	if err != nil || maxOpenFiles < 1e6 {
		fmt.Println(green + "\nThe bulk loader needs to open many files at once. This number depends" +
			" on the size of the data set loaded, the map file output size, and the level" +
			" of indexing. 100,000 is adequate for most data set sizes. See `man ulimit` for" +
			" details of how to change the limit.")
		if err != nil {
			fmt.Printf(red+"Nonfatal error: max open file limit could not be detected: %v\n"+reset, err)
		} else {
			fmt.Printf(yellow+"Current max open files limit: %d\n"+reset, maxOpenFiles)
		}
		fmt.Println()
	}
}
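// Illustrative usage (added commentary, not part of the original file; the data
// and schema file names below are placeholders). A two-shard run of the bulk
// loader, using only flags registered in init() above:
//
//	dgraph bulk -f goldendata.rdf.gz -s goldendata.schema \
//	    --map_shards=4 --reduce_shards=2 --zero=localhost:5080 --out=./out
//
// With reduce_shards=2 this produces ./out/0/p and ./out/1/p, one posting
// directory per group of the new cluster, as set up in run() above.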