github.com/unigraph-dev/dgraph@v1.1.1-0.20200923154953-8b52b426f765/dgraph/cmd/bulk/run.go

/*
 * Copyright 2017-2018 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package bulk

import (
	"encoding/json"
	"fmt"
	"log"
	"math"
	"net/http"
	_ "net/http/pprof" // http profiler
	"os"
	"path/filepath"
	"runtime"
	"strconv"
	"strings"
	"time"

	"github.com/dgraph-io/dgraph/tok"
	"github.com/dgraph-io/dgraph/x"
	"github.com/dustin/go-humanize"
	"github.com/spf13/cobra"
)

// Bulk is the sub-command invoked when running "dgraph bulk".
var Bulk x.SubCommand

var defaultOutDir = "./out"

func init() {
	Bulk.Cmd = &cobra.Command{
		Use:   "bulk",
		Short: "Run Dgraph bulk loader",
		Run: func(cmd *cobra.Command, args []string) {
			defer x.StartProfile(Bulk.Conf).Stop()
			run()
		},
	}
	Bulk.EnvPrefix = "DGRAPH_BULK"

	flag := Bulk.Cmd.Flags()
	flag.StringP("files", "f", "",
		"Location of *.rdf(.gz) or *.json(.gz) file(s) to load.")
	flag.StringP("schema", "s", "",
		"Location of schema file.")
	flag.String("format", "",
		"Specify file format (rdf or json) instead of getting it from filename.")
	flag.String("out", defaultOutDir,
		"Location to write the final dgraph data directories.")
	flag.Bool("replace_out", false,
		"Replace out directory and its contents if it exists.")
	flag.String("tmp", "tmp",
		"Temp directory used for on-disk scratch space. Requires free space proportional"+
			" to the size of the RDF file and the amount of indexing used.")
	flag.IntP("num_go_routines", "j", int(math.Ceil(float64(runtime.NumCPU())/4.0)),
		"Number of worker threads to use. MORE THREADS LEAD TO HIGHER RAM USAGE.")
	flag.Int64("mapoutput_mb", 64,
		"The estimated size of each map file output. Increasing this increases memory usage.")
	flag.Bool("skip_map_phase", false,
		"Skip the map phase (assumes that map output files already exist).")
	flag.Bool("cleanup_tmp", true,
		"Clean up the tmp directory after the loader finishes. Setting this to false allows the"+
			" bulk loader to be re-run while skipping the map phase.")
	flag.Int("reducers", 1,
		"Number of reducers to run concurrently. Increasing this can improve performance, and "+
			"must be less than or equal to the number of reduce shards.")
	flag.Bool("version", false, "Prints the version of Dgraph Bulk Loader.")
	flag.BoolP("store_xids", "x", false, "Generate an xid edge for each node.")
	flag.StringP("zero", "z", "localhost:5080", "gRPC address for Dgraph Zero.")
	// TODO: Potentially move http server to main.
	flag.String("http", "localhost:8080",
		"Address to serve http (pprof).")
	flag.Bool("ignore_errors", false, "Ignore line parsing errors in RDF files.")
	flag.Int("map_shards", 1,
		"Number of map output shards. Must be greater than or equal to the number of reduce "+
			"shards. Increasing this allows more evenly sized reduce shards, at the expense of "+
			"increased memory usage.")
	flag.Int("reduce_shards", 1,
		"Number of reduce shards. This determines the number of dgraph instances in the final "+
			"cluster. Increasing this potentially decreases the reduce stage runtime by using "+
			"more parallelism, but increases memory usage.")
	flag.String("custom_tokenizers", "",
		"Comma-separated list of tokenizer plugins.")
	flag.Bool("new_uids", false,
		"Ignore UIDs in load files and assign new ones.")
}
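// A minimal sketch of a typical invocation using the flags registered above.
// The file names and shard counts here are illustrative assumptions, not
// defaults or recommendations:
//
//	dgraph bulk -f data.rdf.gz -s data.schema \
//		--map_shards=4 --reduce_shards=2 --zero=localhost:5080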
func run() {
	opt := options{
		DataFiles:        Bulk.Conf.GetString("files"),
		DataFormat:       Bulk.Conf.GetString("format"),
		SchemaFile:       Bulk.Conf.GetString("schema"),
		OutDir:           Bulk.Conf.GetString("out"),
		ReplaceOutDir:    Bulk.Conf.GetBool("replace_out"),
		TmpDir:           Bulk.Conf.GetString("tmp"),
		NumGoroutines:    Bulk.Conf.GetInt("num_go_routines"),
		MapBufSize:       uint64(Bulk.Conf.GetInt("mapoutput_mb")),
		SkipMapPhase:     Bulk.Conf.GetBool("skip_map_phase"),
		CleanupTmp:       Bulk.Conf.GetBool("cleanup_tmp"),
		NumReducers:      Bulk.Conf.GetInt("reducers"),
		Version:          Bulk.Conf.GetBool("version"),
		StoreXids:        Bulk.Conf.GetBool("store_xids"),
		ZeroAddr:         Bulk.Conf.GetString("zero"),
		HttpAddr:         Bulk.Conf.GetString("http"),
		IgnoreErrors:     Bulk.Conf.GetBool("ignore_errors"),
		MapShards:        Bulk.Conf.GetInt("map_shards"),
		ReduceShards:     Bulk.Conf.GetInt("reduce_shards"),
		CustomTokenizers: Bulk.Conf.GetString("custom_tokenizers"),
		NewUids:          Bulk.Conf.GetBool("new_uids"),
	}

	x.PrintVersion()
	if opt.Version {
		os.Exit(0)
	}
	if opt.SchemaFile == "" {
		fmt.Fprint(os.Stderr, "Schema file must be specified.\n")
		os.Exit(1)
	} else if _, err := os.Stat(opt.SchemaFile); err != nil && os.IsNotExist(err) {
		fmt.Fprintf(os.Stderr, "Schema path (%v) does not exist.\n", opt.SchemaFile)
		os.Exit(1)
	}
	if opt.DataFiles == "" {
		fmt.Fprint(os.Stderr, "RDF or JSON file(s) location must be specified.\n")
		os.Exit(1)
	} else {
		fileList := strings.Split(opt.DataFiles, ",")
		for _, file := range fileList {
			if _, err := os.Stat(file); err != nil && os.IsNotExist(err) {
				fmt.Fprintf(os.Stderr, "Data path (%v) does not exist.\n", file)
				os.Exit(1)
			}
		}
	}

	if opt.ReduceShards > opt.MapShards {
		fmt.Fprintf(os.Stderr, "Invalid flags: reduce_shards(%d) should be <= map_shards(%d)\n",
			opt.ReduceShards, opt.MapShards)
		os.Exit(1)
	}
	if opt.NumReducers > opt.ReduceShards {
		fmt.Fprintf(os.Stderr, "Invalid flags: reducers(%d) should be <= reduce_shards(%d)\n",
			opt.NumReducers, opt.ReduceShards)
		os.Exit(1)
	}
	if opt.CustomTokenizers != "" {
		for _, soFile := range strings.Split(opt.CustomTokenizers, ",") {
			tok.LoadCustomTokenizer(soFile)
		}
	}

	opt.MapBufSize <<= 20 // Convert from MiB to bytes.

	optBuf, err := json.MarshalIndent(&opt, "", "\t")
	x.Check(err)
	fmt.Println(string(optBuf))

	maxOpenFilesWarning()

	go func() {
		log.Fatal(http.ListenAndServe(opt.HttpAddr, nil))
	}()
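
	// While the loader runs, the pprof handlers registered by the blank
	// net/http/pprof import above are served on opt.HttpAddr, so profiles
	// can be pulled out of band; e.g., assuming the default --http address:
	//
	//	go tool pprof http://localhost:8080/debug/pprof/heap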
	// Make sure it's OK to create or replace the directory specified with the --out option.
	// It is always OK to create or replace the default output directory.
	if opt.OutDir != defaultOutDir && !opt.ReplaceOutDir {
		missingOrEmpty, err := x.IsMissingOrEmptyDir(opt.OutDir)
		x.CheckfNoTrace(err)
		if !missingOrEmpty {
			fmt.Fprintf(os.Stderr, "Output directory exists and is not empty."+
				" Use --replace_out to overwrite it.\n")
			os.Exit(1)
		}
	}

	// Delete and recreate the output dirs to ensure they are empty.
	x.Check(os.RemoveAll(opt.OutDir))
	for i := 0; i < opt.ReduceShards; i++ {
		dir := filepath.Join(opt.OutDir, strconv.Itoa(i), "p")
		x.Check(os.MkdirAll(dir, 0700))
		opt.shardOutputDirs = append(opt.shardOutputDirs, dir)
	}

	// Create a directory just for the bulk loader's usage.
	if !opt.SkipMapPhase {
		x.Check(os.RemoveAll(opt.TmpDir))
		x.Check(os.MkdirAll(opt.TmpDir, 0700))
	}
	if opt.CleanupTmp {
		defer os.RemoveAll(opt.TmpDir)
	}

	// The bulk loader can take up a lot of RAM, so run the GC often.
	go func() {
		ticker := time.NewTicker(10 * time.Second)
		defer ticker.Stop()

		var lastNum uint32
		var ms runtime.MemStats
		for range ticker.C {
			runtime.ReadMemStats(&ms)
			fmt.Printf("GC: %d. InUse: %s. Idle: %s\n", ms.NumGC, humanize.Bytes(ms.HeapInuse),
				humanize.Bytes(ms.HeapIdle-ms.HeapReleased))
			if ms.NumGC > lastNum {
				// GC was already run by the Go runtime. No need to run it again.
				lastNum = ms.NumGC
			} else {
				// The runtime has not collected since the last tick; force a
				// collection and account for it so it isn't triggered twice.
				runtime.GC()
				lastNum = ms.NumGC + 1
			}
		}
	}()

	loader := newLoader(opt)
	if !opt.SkipMapPhase {
		loader.mapStage()
		mergeMapShardsIntoReduceShards(opt)
	}
	loader.reduceStage()
	loader.writeSchema()
	loader.cleanup()
}

func maxOpenFilesWarning() {
	const (
		red    = "\x1b[31m"
		green  = "\x1b[32m"
		yellow = "\x1b[33m"
		reset  = "\x1b[0m"
	)
	maxOpenFiles, err := queryMaxOpenFiles()
	if err != nil || maxOpenFiles < 1e6 {
		fmt.Println(green + "\nThe bulk loader needs to open many files at once. This number depends" +
			" on the size of the data set loaded, the map file output size, and the level" +
			" of indexing. 1,000,000 is adequate for most data set sizes. See `man ulimit` for" +
			" details of how to change the limit." + reset)
		if err != nil {
			fmt.Printf(red+"Nonfatal error: max open file limit could not be detected: %v\n"+reset, err)
		} else {
			fmt.Printf(yellow+"Current max open files limit: %d\n"+reset, maxOpenFiles)
		}
		fmt.Println()
	}
}
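
// A sketch of raising the open-file limit before a run on Linux, relevant to
// the warning above (illustrative only; the exact command depends on the
// shell and on the hard limits configured by the administrator):
//
//	ulimit -n 1048576
//	dgraph bulk -f data.rdf.gz -s data.schema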