github.com/dshekhar95/sub_dgraph@v0.0.0-20230424164411-6be28e40bbf1/dgraph/cmd/bulk/run.go

/*
 * Copyright 2017-2022 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package bulk

import (
	"encoding/json"
	"fmt"
	"log"
	"math"
	"net/http"
	_ "net/http/pprof" // http profiler
	"os"
	"path/filepath"
	"runtime"
	"strconv"
	"strings"

	"github.com/spf13/cobra"

	"github.com/dgraph-io/badger/v3"
	"github.com/dgraph-io/dgraph/ee"
	"github.com/dgraph-io/dgraph/filestore"
	"github.com/dgraph-io/dgraph/protos/pb"
	"github.com/dgraph-io/dgraph/tok"
	"github.com/dgraph-io/dgraph/worker"
	"github.com/dgraph-io/dgraph/x"
	"github.com/dgraph-io/ristretto/z"
)

// Bulk is the sub-command invoked when running "dgraph bulk".
var Bulk x.SubCommand

var defaultOutDir = "./out"

const BulkBadgerDefaults = "compression=snappy; numgoroutines=8;"

func init() {
	Bulk.Cmd = &cobra.Command{
		Use:   "bulk",
		Short: "Run Dgraph Bulk Loader",
		Run: func(cmd *cobra.Command, args []string) {
			defer x.StartProfile(Bulk.Conf).Stop()
			run()
		},
		Annotations: map[string]string{"group": "data-load"},
	}
	Bulk.Cmd.SetHelpTemplate(x.NonRootTemplate)
	Bulk.EnvPrefix = "DGRAPH_BULK"

	flag := Bulk.Cmd.Flags()
	flag.StringP("files", "f", "",
		"Location of *.rdf(.gz) or *.json(.gz) file(s) to load.")
	flag.StringP("schema", "s", "",
		"Location of schema file.")
	flag.StringP("graphql_schema", "g", "", "Location of the GraphQL schema file.")
	flag.String("format", "",
		"Specify file format (rdf or json) instead of getting it from filename.")
	flag.Bool("encrypted", false,
		"Flag to indicate whether schema and data files are encrypted. "+
			"Must be specified with --encryption or vault option(s).")
	flag.Bool("encrypted_out", false,
		"Flag to indicate whether to encrypt the output. "+
			"Must be specified with --encryption or vault option(s).")
	flag.String("out", defaultOutDir,
		"Location to write the final dgraph data directories.")
	flag.Bool("replace_out", false,
		"Replace out directory and its contents if it exists.")
	flag.String("tmp", "tmp",
		"Temp directory used for on-disk scratch space. Requires free space proportional"+
			" to the size of the RDF file and the amount of indexing used.")

	flag.IntP("num_go_routines", "j", int(math.Ceil(float64(runtime.NumCPU())/4.0)),
		"Number of worker threads to use. MORE THREADS LEAD TO HIGHER RAM USAGE.")
	flag.Int64("mapoutput_mb", 2048,
		"The estimated size of each map file output. Increasing this increases memory usage.")
	flag.Int64("partition_mb", 4, "Pick a partition key every N megabytes of data.")
	flag.Bool("skip_map_phase", false,
		"Skip the map phase (assumes that map output files already exist).")
	flag.Bool("cleanup_tmp", true,
		"Clean up the tmp directory after the loader finishes. Setting this to false allows the"+
			" bulk loader to be re-run while skipping the map phase.")
	flag.Int("reducers", 1,
		"Number of reducers to run concurrently. Increasing this can improve performance, and "+
			"must be less than or equal to the number of reduce shards.")
	flag.Bool("version", false, "Prints the version of Dgraph Bulk Loader.")
	flag.Bool("store_xids", false, "Generate an xid edge for each node.")
	flag.StringP("zero", "z", "localhost:5080", "gRPC address for Dgraph zero")
	flag.String("xidmap", "", "Directory to store xid to uid mapping")
	// TODO: Potentially move http server to main.
	flag.String("http", "localhost:8080",
		"Address to serve http (pprof).")
	flag.Bool("ignore_errors", false, "ignore line parsing errors in rdf files")
	flag.Int("map_shards", 1,
		"Number of map output shards. Must be greater than or equal to the number of reduce "+
			"shards. Increasing allows more evenly sized reduce shards, at the expense of "+
			"increased memory usage.")
	flag.Int("reduce_shards", 1,
		"Number of reduce shards. This determines the number of dgraph instances in the final "+
			"cluster. Increasing this potentially decreases the reduce stage runtime by using "+
			"more parallelism, but increases memory usage.")
	flag.String("custom_tokenizers", "",
		"Comma separated list of tokenizer plugins")
	flag.Bool("new_uids", false,
		"Ignore UIDs in load files and assign new ones.")
	flag.Uint64("force-namespace", math.MaxUint64,
		"Namespace onto which to load the data. If not set, will preserve the namespace."+
			" When using this flag to load data into a specific namespace, make sure that the "+
			"load data does not contain ACL data.")

	flag.String("badger", BulkBadgerDefaults, z.NewSuperFlagHelp(BulkBadgerDefaults).
		Head("Badger options (Refer to badger documentation for all possible options)").
		Flag("compression",
			"Specifies the compression algorithm and compression level (if applicable) for the "+
				`postings directory. "none" would disable compression, while "zstd:1" would set `+
				"zstd compression at level 1.").
		Flag("numgoroutines",
			"The number of goroutines to use in badger.Stream.").
		String())

	x.RegisterClientTLSFlags(flag)
	// Encryption and Vault options
	ee.RegisterEncFlag(flag)
}

func run() {
	cacheSize := 64 << 20 // These are the default values. User can overwrite them using --badger.
	cacheDefaults := fmt.Sprintf("indexcachesize=%d; blockcachesize=%d; ",
		(70*cacheSize)/100, (30*cacheSize)/100)

	bopts := badger.DefaultOptions("").FromSuperFlag(BulkBadgerDefaults + cacheDefaults).
		FromSuperFlag(Bulk.Conf.GetString("badger"))
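	// Added commentary (not in the original source): the two FromSuperFlag calls
	// above layer the Badger options. The hard-coded BulkBadgerDefaults and the
	// computed cache sizes are applied first, and the user-supplied --badger
	// superflag is applied second, so any option named in --badger overrides
	// those defaults while unnamed options keep their default values.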
	keys, err := ee.GetKeys(Bulk.Conf)
	x.Check(err)

	opt := options{
		DataFiles:        Bulk.Conf.GetString("files"),
		DataFormat:       Bulk.Conf.GetString("format"),
		EncryptionKey:    keys.EncKey,
		SchemaFile:       Bulk.Conf.GetString("schema"),
		GqlSchemaFile:    Bulk.Conf.GetString("graphql_schema"),
		Encrypted:        Bulk.Conf.GetBool("encrypted"),
		EncryptedOut:     Bulk.Conf.GetBool("encrypted_out"),
		OutDir:           Bulk.Conf.GetString("out"),
		ReplaceOutDir:    Bulk.Conf.GetBool("replace_out"),
		TmpDir:           Bulk.Conf.GetString("tmp"),
		NumGoroutines:    Bulk.Conf.GetInt("num_go_routines"),
		MapBufSize:       uint64(Bulk.Conf.GetInt("mapoutput_mb")),
		PartitionBufSize: int64(Bulk.Conf.GetInt("partition_mb")),
		SkipMapPhase:     Bulk.Conf.GetBool("skip_map_phase"),
		CleanupTmp:       Bulk.Conf.GetBool("cleanup_tmp"),
		NumReducers:      Bulk.Conf.GetInt("reducers"),
		Version:          Bulk.Conf.GetBool("version"),
		StoreXids:        Bulk.Conf.GetBool("store_xids"),
		ZeroAddr:         Bulk.Conf.GetString("zero"),
		HttpAddr:         Bulk.Conf.GetString("http"),
		IgnoreErrors:     Bulk.Conf.GetBool("ignore_errors"),
		MapShards:        Bulk.Conf.GetInt("map_shards"),
		ReduceShards:     Bulk.Conf.GetInt("reduce_shards"),
		CustomTokenizers: Bulk.Conf.GetString("custom_tokenizers"),
		NewUids:          Bulk.Conf.GetBool("new_uids"),
		ClientDir:        Bulk.Conf.GetString("xidmap"),
		Namespace:        Bulk.Conf.GetUint64("force-namespace"),
		Badger:           bopts,
	}

	x.PrintVersion()
	if opt.Version {
		os.Exit(0)
	}

	if len(opt.EncryptionKey) == 0 {
		if opt.Encrypted || opt.EncryptedOut {
			fmt.Fprint(os.Stderr, "Must use --encryption or vault option(s).\n")
			os.Exit(1)
		}
	} else {
		requiredFlags := Bulk.Cmd.Flags().Changed("encrypted") &&
			Bulk.Cmd.Flags().Changed("encrypted_out")
		if !requiredFlags {
			fmt.Fprint(os.Stderr,
				"Must specify --encrypted and --encrypted_out when providing encryption key.\n")
			os.Exit(1)
		}
		if !opt.Encrypted && !opt.EncryptedOut {
			fmt.Fprint(os.Stderr,
				"Must set --encrypted and/or --encrypted_out to true when providing encryption key.\n")
			os.Exit(1)
		}

		tlsConf, err := x.LoadClientTLSConfigForInternalPort(Bulk.Conf)
		x.Check(err)
		// Need to set zero addr in WorkerConfig before checking the license.
		x.WorkerConfig.ZeroAddr = []string{opt.ZeroAddr}
		x.WorkerConfig.TLSClientConfig = tlsConf
		if !worker.EnterpriseEnabled() {
			// Crash since the enterprise license is not enabled.
			log.Fatal("Enterprise License needed for the Encryption feature.")
		} else {
			log.Printf("Encryption feature enabled.")
		}
	}
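	// Added commentary (not in the original source): the checks above mean that
	// supplying an encryption key (via the --encryption or Vault options) requires
	// both --encrypted and --encrypted_out to be set explicitly, with at least one
	// of them true, and requires an enterprise-enabled Zero; otherwise the loader
	// exits before doing any work.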
	fmt.Printf("Encrypted input: %v; Encrypted output: %v\n", opt.Encrypted, opt.EncryptedOut)

	if opt.SchemaFile == "" {
		fmt.Fprint(os.Stderr, "Schema file must be specified.\n")
		os.Exit(1)
	}
	if !filestore.Exists(opt.SchemaFile) {
		fmt.Fprintf(os.Stderr, "Schema path(%v) does not exist.\n", opt.SchemaFile)
		os.Exit(1)
	}
	if opt.DataFiles == "" {
		fmt.Fprint(os.Stderr, "RDF or JSON file(s) location must be specified.\n")
		os.Exit(1)
	} else {
		fileList := strings.Split(opt.DataFiles, ",")
		for _, file := range fileList {
			if !filestore.Exists(file) {
				fmt.Fprintf(os.Stderr, "Data path(%v) does not exist.\n", file)
				os.Exit(1)
			}
		}
	}

	if opt.ReduceShards > opt.MapShards {
		fmt.Fprintf(os.Stderr, "Invalid flags: reduce_shards(%d) should be <= map_shards(%d)\n",
			opt.ReduceShards, opt.MapShards)
		os.Exit(1)
	}
	if opt.NumReducers > opt.ReduceShards {
		fmt.Fprintf(os.Stderr, "Invalid flags: reducers(%d) should be <= reduce_shards(%d)\n",
			opt.NumReducers, opt.ReduceShards)
		os.Exit(1)
	}
	if opt.CustomTokenizers != "" {
		for _, soFile := range strings.Split(opt.CustomTokenizers, ",") {
			tok.LoadCustomTokenizer(soFile)
		}
	}
	if opt.MapBufSize <= 0 || opt.PartitionBufSize <= 0 {
		fmt.Fprintf(os.Stderr, "mapoutput_mb: %d and partition_mb: %d must be greater than zero\n",
			opt.MapBufSize, opt.PartitionBufSize)
		os.Exit(1)
	}

	opt.MapBufSize <<= 20       // Convert from MB to B.
	opt.PartitionBufSize <<= 20 // Convert from MB to B.

	optBuf, err := json.MarshalIndent(&opt, "", "\t")
	x.Check(err)
	fmt.Println(string(optBuf))

	maxOpenFilesWarning()

	// Register the jemalloc handler before the debug HTTP server starts serving.
	http.HandleFunc("/jemalloc", x.JemallocHandler)
	go func() {
		log.Fatal(http.ListenAndServe(opt.HttpAddr, nil))
	}()

	// Make sure it's OK to create or replace the directory specified with the --out option.
	// It is always OK to create or replace the default output directory.
	if opt.OutDir != defaultOutDir && !opt.ReplaceOutDir {
		err := x.IsMissingOrEmptyDir(opt.OutDir)
		if err == nil {
			fmt.Fprintf(os.Stderr, "Output directory exists and is not empty."+
				" Use --replace_out to overwrite it.\n")
			os.Exit(1)
		} else if err != x.ErrMissingDir {
			x.CheckfNoTrace(err)
		}
	}

	// Delete and recreate the output dirs to ensure they are empty.
	x.Check(os.RemoveAll(opt.OutDir))
	for i := 0; i < opt.ReduceShards; i++ {
		dir := filepath.Join(opt.OutDir, strconv.Itoa(i), "p")
		x.Check(os.MkdirAll(dir, 0700))
		opt.shardOutputDirs = append(opt.shardOutputDirs, dir)

		x.Check(x.WriteGroupIdFile(dir, uint32(i+1)))
	}
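	// Added commentary (not in the original source): each reduce shard i gets its
	// own posting directory at <out>/<i>/p, tagged with group ID i+1 via the
	// group_id file written above. Per the reduce_shards flag description, each of
	// these directories backs one dgraph instance (group) in the final cluster.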
	// Create a directory just for bulk loader's usage.
	if !opt.SkipMapPhase {
		x.Check(os.RemoveAll(opt.TmpDir))
		x.Check(os.MkdirAll(opt.TmpDir, 0700))
	}
	if opt.CleanupTmp {
		defer os.RemoveAll(opt.TmpDir)
	}

	// Create directory for temporary buffers used in map-reduce phase
	bufDir := filepath.Join(opt.TmpDir, bufferDir)
	x.Check(os.RemoveAll(bufDir))
	x.Check(os.MkdirAll(bufDir, 0700))
	defer os.RemoveAll(bufDir)

	loader := newLoader(&opt)

	const bulkMetaFilename = "bulk.meta"
	bulkMetaPath := filepath.Join(opt.TmpDir, bulkMetaFilename)

	if opt.SkipMapPhase {
		bulkMetaData, err := os.ReadFile(bulkMetaPath)
		if err != nil {
			fmt.Fprintln(os.Stderr, "Error reading from bulk meta file")
			os.Exit(1)
		}

		var bulkMeta pb.BulkMeta
		if err = bulkMeta.Unmarshal(bulkMetaData); err != nil {
			fmt.Fprintln(os.Stderr, "Error deserializing bulk meta file")
			os.Exit(1)
		}

		loader.prog.mapEdgeCount = bulkMeta.EdgeCount
		loader.schema.schemaMap = bulkMeta.SchemaMap
		loader.schema.types = bulkMeta.Types
	} else {
		loader.mapStage()
		mergeMapShardsIntoReduceShards(&opt)
		loader.leaseNamespaces()

		bulkMeta := pb.BulkMeta{
			EdgeCount: loader.prog.mapEdgeCount,
			SchemaMap: loader.schema.schemaMap,
			Types:     loader.schema.types,
		}
		bulkMetaData, err := bulkMeta.Marshal()
		if err != nil {
			fmt.Fprintln(os.Stderr, "Error serializing bulk meta file")
			os.Exit(1)
		}
		if err = os.WriteFile(bulkMetaPath, bulkMetaData, 0600); err != nil {
			fmt.Fprintln(os.Stderr, "Error writing to bulk meta file")
			os.Exit(1)
		}
	}
	loader.reduceStage()
	loader.writeSchema()
	loader.cleanup()
}

func maxOpenFilesWarning() {
	const (
		red    = "\x1b[31m"
		green  = "\x1b[32m"
		yellow = "\x1b[33m"
		reset  = "\x1b[0m"
	)
	maxOpenFiles, err := x.QueryMaxOpenFiles()
	if err != nil || maxOpenFiles < 1e6 {
		fmt.Println(green + "\nThe bulk loader needs to open many files at once. This number depends" +
			" on the size of the data set loaded, the map file output size, and the level" +
			" of indexing. 100,000 is adequate for most data set sizes. See `man ulimit` for" +
			" details of how to change the limit.")
		if err != nil {
			fmt.Printf(red+"Nonfatal error: max open file limit could not be detected: %v\n"+reset, err)
		} else {
			fmt.Printf(yellow+"Current max open files limit: %d\n"+reset, maxOpenFiles)
		}
		fmt.Println()
	}
}
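// Illustrative usage (added commentary, not part of the original file; the data
// and schema file names below are placeholders). A two-shard run of the bulk
// loader, using only flags registered in init() above:
//
//	dgraph bulk -f goldendata.rdf.gz -s goldendata.schema \
//	    --map_shards=4 --reduce_shards=2 --zero=localhost:5080 --out=./out
//
// With reduce_shards=2 this produces ./out/0/p and ./out/1/p, one posting
// directory per group of the new cluster, as set up in run() above.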