github.com/unigraph-dev/dgraph@v1.1.1-0.20200923154953-8b52b426f765/dgraph/cmd/bulk/run.go

/*
 * Copyright 2017-2018 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package bulk

import (
	"encoding/json"
	"fmt"
	"log"
	"math"
	"net/http"
	_ "net/http/pprof" // http profiler
	"os"
	"path/filepath"
	"runtime"
	"strconv"
	"strings"
	"time"

	"github.com/dgraph-io/dgraph/tok"
	"github.com/dgraph-io/dgraph/x"
	"github.com/dustin/go-humanize"
	"github.com/spf13/cobra"
)

// Bulk is the sub-command invoked when running "dgraph bulk".
var Bulk x.SubCommand

var defaultOutDir = "./out"

func init() {
	Bulk.Cmd = &cobra.Command{
		Use:   "bulk",
		Short: "Run Dgraph bulk loader",
		Run: func(cmd *cobra.Command, args []string) {
			defer x.StartProfile(Bulk.Conf).Stop()
			run()
		},
	}
	Bulk.EnvPrefix = "DGRAPH_BULK"

	flag := Bulk.Cmd.Flags()
	flag.StringP("files", "f", "",
		"Location of *.rdf(.gz) or *.json(.gz) file(s) to load.")
	flag.StringP("schema", "s", "",
		"Location of schema file.")
	flag.String("format", "",
		"Specify file format (rdf or json) instead of getting it from filename.")
	flag.String("out", defaultOutDir,
		"Location to write the final dgraph data directories.")
	flag.Bool("replace_out", false,
		"Replace out directory and its contents if it exists.")
    66  	flag.String("tmp", "tmp",
    67  		"Temp directory used to use for on-disk scratch space. Requires free space proportional"+
    68  			" to the size of the RDF file and the amount of indexing used.")
	flag.IntP("num_go_routines", "j", int(math.Ceil(float64(runtime.NumCPU())/4.0)),
		"Number of worker threads to use. MORE THREADS LEAD TO HIGHER RAM USAGE.")
	flag.Int64("mapoutput_mb", 64,
		"The estimated size of each map file output, in MB. Increasing this increases memory usage.")
    73  	flag.Bool("skip_map_phase", false,
    74  		"Skip the map phase (assumes that map output files already exist).")
    75  	flag.Bool("cleanup_tmp", true,
    76  		"Clean up the tmp directory after the loader finishes. Setting this to false allows the"+
    77  			" bulk loader can be re-run while skipping the map phase.")
    78  	flag.Int("reducers", 1,
    79  		"Number of reducers to run concurrently. Increasing this can improve performance, and "+
    80  			"must be less than or equal to the number of reduce shards.")
    81  	flag.Bool("version", false, "Prints the version of Dgraph Bulk Loader.")
    82  	flag.BoolP("store_xids", "x", false, "Generate an xid edge for each node.")
    83  	flag.StringP("zero", "z", "localhost:5080", "gRPC address for Dgraph zero")
    84  	// TODO: Potentially move http server to main.
    85  	flag.String("http", "localhost:8080",
    86  		"Address to serve http (pprof).")
    87  	flag.Bool("ignore_errors", false, "ignore line parsing errors in rdf files")
    88  	flag.Int("map_shards", 1,
    89  		"Number of map output shards. Must be greater than or equal to the number of reduce "+
    90  			"shards. Increasing allows more evenly sized reduce shards, at the expense of "+
    91  			"increased memory usage.")
    92  	flag.Int("reduce_shards", 1,
    93  		"Number of reduce shards. This determines the number of dgraph instances in the final "+
    94  			"cluster. Increasing this potentially decreases the reduce stage runtime by using "+
    95  			"more parallelism, but increases memory usage.")
    96  	flag.String("custom_tokenizers", "",
    97  		"Comma separated list of tokenizer plugins")
    98  	flag.Bool("new_uids", false,
    99  		"Ignore UIDs in load files and assign new ones.")
   100  }
   101  
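// run executes the bulk load end to end. A typical invocation (the file
// names here are illustrative, not taken from this repository) supplies the
// data and schema locations and sizes the shard counts for the target
// cluster:
//
//	dgraph bulk -f data.rdf.gz -s data.schema --map_shards=4 --reduce_shards=2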
func run() {
	opt := options{
		DataFiles:        Bulk.Conf.GetString("files"),
		DataFormat:       Bulk.Conf.GetString("format"),
		SchemaFile:       Bulk.Conf.GetString("schema"),
		OutDir:           Bulk.Conf.GetString("out"),
		ReplaceOutDir:    Bulk.Conf.GetBool("replace_out"),
		TmpDir:           Bulk.Conf.GetString("tmp"),
		NumGoroutines:    Bulk.Conf.GetInt("num_go_routines"),
		MapBufSize:       uint64(Bulk.Conf.GetInt("mapoutput_mb")),
		SkipMapPhase:     Bulk.Conf.GetBool("skip_map_phase"),
		CleanupTmp:       Bulk.Conf.GetBool("cleanup_tmp"),
		NumReducers:      Bulk.Conf.GetInt("reducers"),
		Version:          Bulk.Conf.GetBool("version"),
		StoreXids:        Bulk.Conf.GetBool("store_xids"),
		ZeroAddr:         Bulk.Conf.GetString("zero"),
		HttpAddr:         Bulk.Conf.GetString("http"),
		IgnoreErrors:     Bulk.Conf.GetBool("ignore_errors"),
		MapShards:        Bulk.Conf.GetInt("map_shards"),
		ReduceShards:     Bulk.Conf.GetInt("reduce_shards"),
		CustomTokenizers: Bulk.Conf.GetString("custom_tokenizers"),
		NewUids:          Bulk.Conf.GetBool("new_uids"),
	}

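	// The version banner is always printed; with --version, exit right after it.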
	x.PrintVersion()
	if opt.Version {
		os.Exit(0)
	}
	if opt.SchemaFile == "" {
		fmt.Fprint(os.Stderr, "Schema file must be specified.\n")
		os.Exit(1)
	} else if _, err := os.Stat(opt.SchemaFile); err != nil && os.IsNotExist(err) {
		fmt.Fprintf(os.Stderr, "Schema path(%v) does not exist.\n", opt.SchemaFile)
		os.Exit(1)
	}
	if opt.DataFiles == "" {
		fmt.Fprint(os.Stderr, "RDF or JSON file(s) location must be specified.\n")
		os.Exit(1)
	} else {
		fileList := strings.Split(opt.DataFiles, ",")
		for _, file := range fileList {
			if _, err := os.Stat(file); err != nil && os.IsNotExist(err) {
				fmt.Fprintf(os.Stderr, "Data path(%v) does not exist.\n", file)
				os.Exit(1)
			}
		}
	}

	if opt.ReduceShards > opt.MapShards {
		fmt.Fprintf(os.Stderr, "Invalid flags: reduce_shards(%d) should be <= map_shards(%d)\n",
			opt.ReduceShards, opt.MapShards)
		os.Exit(1)
	}
	if opt.NumReducers > opt.ReduceShards {
		fmt.Fprintf(os.Stderr, "Invalid flags: reducers(%d) should be <= reduce_shards(%d)\n",
			opt.NumReducers, opt.ReduceShards)
		os.Exit(1)
	}
	if opt.CustomTokenizers != "" {
		for _, soFile := range strings.Split(opt.CustomTokenizers, ",") {
			tok.LoadCustomTokenizer(soFile)
		}
	}

	opt.MapBufSize <<= 20 // Convert from MB to B.

	optBuf, err := json.MarshalIndent(&opt, "", "\t")
	x.Check(err)
	fmt.Println(string(optBuf))

	maxOpenFilesWarning()

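	// Serve the pprof handlers registered on http.DefaultServeMux by the
	// net/http/pprof import, so the load can be profiled while it runs.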
	go func() {
		log.Fatal(http.ListenAndServe(opt.HttpAddr, nil))
	}()

	// Make sure it's OK to create or replace the directory specified with the --out option.
	// It is always OK to create or replace the default output directory.
	if opt.OutDir != defaultOutDir && !opt.ReplaceOutDir {
		missingOrEmpty, err := x.IsMissingOrEmptyDir(opt.OutDir)
		x.CheckfNoTrace(err)
		if !missingOrEmpty {
			fmt.Fprintf(os.Stderr, "Output directory exists and is not empty."+
				" Use --replace_out to overwrite it.\n")
			os.Exit(1)
		}
	}

	// Delete and recreate the output dirs to ensure they are empty.
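	// One <out>/<shard>/p directory is created per reduce shard; each one
	// becomes the posting directory of a dgraph instance in the final cluster.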
	x.Check(os.RemoveAll(opt.OutDir))
	for i := 0; i < opt.ReduceShards; i++ {
		dir := filepath.Join(opt.OutDir, strconv.Itoa(i), "p")
		x.Check(os.MkdirAll(dir, 0700))
		opt.shardOutputDirs = append(opt.shardOutputDirs, dir)
	}

	// Create a directory just for bulk loader's usage.
	if !opt.SkipMapPhase {
		x.Check(os.RemoveAll(opt.TmpDir))
		x.Check(os.MkdirAll(opt.TmpDir, 0700))
	}
	if opt.CleanupTmp {
		defer os.RemoveAll(opt.TmpDir)
	}

	// Bulk loader can take up a lot of RAM. So, run GC often.
	go func() {
		ticker := time.NewTicker(10 * time.Second)
		defer ticker.Stop()

		var lastNum uint32
		var ms runtime.MemStats
		for range ticker.C {
			runtime.ReadMemStats(&ms)
			fmt.Printf("GC: %d. InUse: %s. Idle: %s\n", ms.NumGC, humanize.Bytes(ms.HeapInuse),
				humanize.Bytes(ms.HeapIdle-ms.HeapReleased))
			if ms.NumGC > lastNum {
				// GC was already run by the Go runtime. No need to run it again.
				lastNum = ms.NumGC
			} else {
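				// The runtime hasn't completed a GC cycle since the last
				// tick, so force one to keep memory usage in check.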
				runtime.GC()
				lastNum = ms.NumGC + 1
			}
		}
	}()

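	// Run the pipeline: the map phase (unless skipped), the merge of map
	// shards into reduce shards, the reduce phase, and the schema write.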
	loader := newLoader(opt)
	if !opt.SkipMapPhase {
		loader.mapStage()
		mergeMapShardsIntoReduceShards(opt)
	}
	loader.reduceStage()
	loader.writeSchema()
	loader.cleanup()
}

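// maxOpenFilesWarning prints a warning when the open-file limit cannot be
// detected or looks too low for a bulk load.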
func maxOpenFilesWarning() {
	const (
		red    = "\x1b[31m"
		green  = "\x1b[32m"
		yellow = "\x1b[33m"
		reset  = "\x1b[0m"
	)
	maxOpenFiles, err := queryMaxOpenFiles()
	if err != nil || maxOpenFiles < 1e6 {
		fmt.Println(green + "\nThe bulk loader needs to open many files at once. This number depends" +
			" on the size of the data set loaded, the map file output size, and the level" +
			" of indexing. 1,000,000 is adequate for most data set sizes. See `man ulimit` for" +
			" details of how to change the limit.")
		if err != nil {
			fmt.Printf(red+"Nonfatal error: max open file limit could not be detected: %v\n"+reset, err)
		} else {
			fmt.Printf(yellow+"Current max open files limit: %d\n"+reset, maxOpenFiles)
		}
		fmt.Println()
	}
}