github.com/dshekhar95/sub_dgraph@v0.0.0-20230424164411-6be28e40bbf1/dgraph/cmd/bulk/loader.go

/*
 * Copyright 2017-2022 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package bulk

import (
	"bytes"
	"compress/gzip"
	"context"
	"encoding/json"
	"fmt"
	"hash/adler32"
	"io"
	"log"
	"math"
	"os"
	"path/filepath"
	"strconv"
	"sync"
	"time"

	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials"
	"google.golang.org/grpc/credentials/insecure"

	"github.com/dgraph-io/badger/v3"
	"github.com/dgraph-io/badger/v3/y"
	"github.com/dgraph-io/dgraph/chunker"
	"github.com/dgraph-io/dgraph/ee/enc"
	"github.com/dgraph-io/dgraph/filestore"
	"github.com/dgraph-io/dgraph/protos/pb"
	"github.com/dgraph-io/dgraph/schema"
	"github.com/dgraph-io/dgraph/x"
	"github.com/dgraph-io/dgraph/xidmap"
)

// options holds the configuration for a bulk load run.
type options struct {
	DataFiles        string
	DataFormat       string
	SchemaFile       string
	GqlSchemaFile    string
	OutDir           string
	ReplaceOutDir    bool
	TmpDir           string
	NumGoroutines    int
	MapBufSize       uint64
	PartitionBufSize int64
	SkipMapPhase     bool
	CleanupTmp       bool
	NumReducers      int
	Version          bool
	StoreXids        bool
	ZeroAddr         string
	HttpAddr         string
	IgnoreErrors     bool
	CustomTokenizers string
	NewUids          bool
	ClientDir        string
	Encrypted        bool
	EncryptedOut     bool

	MapShards    int
	ReduceShards int

	Namespace uint64

	shardOutputDirs []string

	// ........... Badger options ..........
	// EncryptionKey is the key used for encryption. Enterprise-only feature.
	EncryptionKey x.Sensitive
	// Badger options.
	Badger badger.Options
}

// state is shared between the loader and its mappers.
type state struct {
	opt           *options
	prog          *progress
	xids          *xidmap.XidMap
	schema        *schemaStore
	shards        *shardMap
	readerChunkCh chan *bytes.Buffer
	mapFileId     uint32 // Used atomically to name the output files of the mappers.
	dbs           []*badger.DB
	tmpDbs        []*badger.DB // Temporary DBs to write the split lists to, to avoid ordering issues.
	writeTs       uint64       // All badger writes use this timestamp.
	namespaces    *sync.Map    // To store the encountered namespaces.
}

type loader struct {
	*state
	mappers []*mapper
	zero    *grpc.ClientConn
}

// newLoader connects to zero and initializes the shared state and the mappers.
func newLoader(opt *options) *loader {
	if opt == nil {
		log.Fatalf("Cannot create loader with nil options.")
	}

	fmt.Printf("Connecting to zero at %s\n", opt.ZeroAddr)
	ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
	defer cancel()

	tlsConf, err := x.LoadClientTLSConfigForInternalPort(Bulk.Conf)
	x.Check(err)
	dialOpts := []grpc.DialOption{
		grpc.WithBlock(),
	}
	if tlsConf != nil {
		dialOpts = append(dialOpts, grpc.WithTransportCredentials(credentials.NewTLS(tlsConf)))
	} else {
		dialOpts = append(dialOpts, grpc.WithTransportCredentials(insecure.NewCredentials()))
	}
	zero, err := grpc.DialContext(ctx, opt.ZeroAddr, dialOpts...)
	x.Checkf(err, "Unable to connect to zero. Is it running at %s?", opt.ZeroAddr)
	st := &state{
		opt:    opt,
		prog:   newProgress(),
		shards: newShardMap(opt.MapShards),
		// Lots of gz readers, so not much channel buffer needed.
		readerChunkCh: make(chan *bytes.Buffer, opt.NumGoroutines),
		writeTs:       getWriteTimestamp(zero),
		namespaces:    &sync.Map{},
	}
	st.schema = newSchemaStore(readSchema(opt), opt, st)
	ld := &loader{
		state:   st,
		mappers: make([]*mapper, opt.NumGoroutines),
		zero:    zero,
	}
	for i := 0; i < opt.NumGoroutines; i++ {
		ld.mappers[i] = newMapper(st)
	}
	go ld.prog.report()
	return ld
}

// getWriteTimestamp fetches a single timestamp from zero, retrying until it succeeds.
func getWriteTimestamp(zero *grpc.ClientConn) uint64 {
	client := pb.NewZeroClient(zero)
	for {
		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
		ts, err := client.Timestamps(ctx, &pb.Num{Val: 1})
		cancel()
		if err == nil {
			return ts.GetStartId()
		}
		fmt.Printf("Error communicating with dgraph zero, retrying: %v\n", err)
		time.Sleep(time.Second)
	}
}

// leaseNamespaces is called at the end of the map phase. It leases namespace IDs up to the
// maximum namespace ID seen.
func (ld *loader) leaseNamespaces() {
	var maxNs uint64
	ld.namespaces.Range(func(key, value interface{}) bool {
		if ns := key.(uint64); ns > maxNs {
			maxNs = ns
		}
		return true
	})

	// If only the default namespace is seen, do nothing.
	if maxNs == 0 {
		return
	}

	client := pb.NewZeroClient(ld.zero)
	for {
		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
		ns, err := client.AssignIds(ctx, &pb.Num{Val: maxNs, Type: pb.Num_NS_ID})
		cancel()
		if err == nil {
			fmt.Printf("Assigned namespaces up to %d\n", ns.GetEndId())
			return
		}
		fmt.Printf("Error communicating with dgraph zero, retrying: %v\n", err)
		time.Sleep(time.Second)
	}
}

// readSchema opens the schema file, decrypting and decompressing it as needed, and parses it
// under the configured namespace.
func readSchema(opt *options) *schema.ParsedSchema {
	f, err := filestore.Open(opt.SchemaFile)
	x.Check(err)
	defer f.Close()

	key := opt.EncryptionKey
	if !opt.Encrypted {
		key = nil
	}
	r, err := enc.GetReader(key, f)
	x.Check(err)
	if filepath.Ext(opt.SchemaFile) == ".gz" {
		r, err = gzip.NewReader(r)
		x.Check(err)
	}

	buf, err := io.ReadAll(r)
	x.Check(err)

	result, err := schema.ParseWithNamespace(string(buf), opt.Namespace)
	x.Check(err)
	return result
}

// mapStage runs the map phase: it chunks the input files and feeds the chunks to the mappers.
func (ld *loader) mapStage() {
	ld.prog.setPhase(mapPhase)
	var db *badger.DB
	if len(ld.opt.ClientDir) > 0 {
		x.Check(os.MkdirAll(ld.opt.ClientDir, 0700))

		var err error
		db, err = badger.Open(badger.DefaultOptions(ld.opt.ClientDir))
		x.Checkf(err, "Error while creating badger KV posting store")
	}
	ld.xids = xidmap.New(xidmap.XidMapOptions{
		UidAssigner: ld.zero,
		DB:          db,
		Dir:         filepath.Join(ld.opt.TmpDir, bufferDir),
	})

	fs := filestore.NewFileStore(ld.opt.DataFiles)

	files := fs.FindDataFiles(ld.opt.DataFiles, []string{".rdf", ".rdf.gz", ".json", ".json.gz"})
	if len(files) == 0 {
		fmt.Printf("No data files found in %s.\n", ld.opt.DataFiles)
		os.Exit(1)
	}

	// Because mappers must handle chunks that may be from different input files, they must all
	// assume the same data format, either RDF or JSON. Use the one specified by the user or by
	// the first load file.
	loadType := chunker.DataFormat(files[0], ld.opt.DataFormat)
	if loadType == chunker.UnknownFormat {
		// Don't try to detect JSON input in the bulk loader.
		fmt.Printf("Need --format=rdf or --format=json to load %s\n", files[0])
		os.Exit(1)
	}

	var mapperWg sync.WaitGroup
	mapperWg.Add(len(ld.mappers))
	for _, m := range ld.mappers {
		go func(m *mapper) {
			m.run(loadType)
			mapperWg.Done()
		}(m)
	}

	// This is the main map loop.
	thr := y.NewThrottle(ld.opt.NumGoroutines)
	for i, file := range files {
		x.Check(thr.Do())
		fmt.Printf("Processing file (%d out of %d): %s\n", i+1, len(files), file)

		go func(file string) {
			defer thr.Done(nil)

			key := ld.opt.EncryptionKey
			if !ld.opt.Encrypted {
				key = nil
			}
			r, cleanup := fs.ChunkReader(file, key)
			defer cleanup()

			chunk := chunker.NewChunker(loadType, 1000)
			for {
				chunkBuf, err := chunk.Chunk(r)
				if chunkBuf != nil && chunkBuf.Len() > 0 {
					ld.readerChunkCh <- chunkBuf
				}
				if err == io.EOF {
					break
				} else if err != nil {
					x.Check(err)
				}
			}
		}(file)
	}
	x.Check(thr.Finish())

	// Send the GraphQL triples.
	ld.processGqlSchema(loadType)

	close(ld.readerChunkCh)
	mapperWg.Wait()

	// Allow memory to GC before the reduce phase.
	for i := range ld.mappers {
		ld.mappers[i] = nil
	}
	x.Check(ld.xids.Flush())
	if db != nil {
		x.Check(db.Close())
	}
	ld.xids = nil
}

// parseGqlSchema decodes an exported GraphQL schema. Exports from 21.03 onwards are a JSON
// list of per-namespace schemas; older exports are a plain schema string, which is mapped to
// the galaxy namespace.
func parseGqlSchema(s string) map[uint64]string {
	var schemas []x.ExportedGQLSchema
	if err := json.Unmarshal([]byte(s), &schemas); err != nil {
		fmt.Println("Error while decoding the graphql schema. Assuming it to be in format < 21.03.")
		return map[uint64]string{x.GalaxyNamespace: s}
	}

	schemaMap := make(map[uint64]string)
	for _, schema := range schemas {
		if _, ok := schemaMap[schema.Namespace]; ok {
			fmt.Printf("Found multiple GraphQL schemas for namespace %d.\n", schema.Namespace)
			continue
		}
		schemaMap[schema.Namespace] = schema.Schema
	}
	return schemaMap
}

// processGqlSchema reads the GraphQL schema file and feeds the corresponding RDF triples or
// JSON objects into the reader chunk channel, so they are mapped along with the data.
func (ld *loader) processGqlSchema(loadType chunker.InputFormat) {
	if ld.opt.GqlSchemaFile == "" {
		return
	}

	f, err := filestore.Open(ld.opt.GqlSchemaFile)
	x.Check(err)
	defer f.Close()

	key := ld.opt.EncryptionKey
	if !ld.opt.Encrypted {
		key = nil
	}
	r, err := enc.GetReader(key, f)
	x.Check(err)
	if filepath.Ext(ld.opt.GqlSchemaFile) == ".gz" {
		r, err = gzip.NewReader(r)
		x.Check(err)
	}

	buf, err := io.ReadAll(r)
	x.Check(err)

	rdfSchema := `_:gqlschema <dgraph.type> "dgraph.graphql" <%#x> .
	_:gqlschema <dgraph.graphql.xid> "dgraph.graphql.schema" <%#x> .
	_:gqlschema <dgraph.graphql.schema> %s <%#x> .
	`

	jsonSchema := `{
		"namespace": "%#x",
		"dgraph.type": "dgraph.graphql",
		"dgraph.graphql.xid": "dgraph.graphql.schema",
		"dgraph.graphql.schema": %s
	}`

	process := func(ns uint64, schema string) {
		// Ignore the schema if the namespace has not been seen in the data.
		if _, ok := ld.schema.namespaces.Load(ns); !ok {
			fmt.Printf("No data exists for namespace %d. Cannot load the GraphQL schema.\n", ns)
			return
		}
		gqlBuf := &bytes.Buffer{}
		schema = strconv.Quote(schema)
		switch loadType {
		case chunker.RdfFormat:
			x.Check2(gqlBuf.Write([]byte(fmt.Sprintf(rdfSchema, ns, ns, schema, ns))))
		case chunker.JsonFormat:
			x.Check2(gqlBuf.Write([]byte(fmt.Sprintf(jsonSchema, ns, schema))))
		}
		ld.readerChunkCh <- gqlBuf
	}

	schemas := parseGqlSchema(string(buf))
	if ld.opt.Namespace == math.MaxUint64 {
		// Preserve the namespace.
		for ns, schema := range schemas {
			process(ns, schema)
		}
		return
	}

	switch len(schemas) {
	case 1:
		// The user might have exported from a different namespace, so schema.Namespace may not
		// hold the correct value.
		for _, schema := range schemas {
			process(ld.opt.Namespace, schema)
		}
	default:
		if _, ok := schemas[ld.opt.Namespace]; !ok {
			// We expect only a single GraphQL schema when loading into a specific namespace.
			fmt.Printf("Didn't find GraphQL schema for namespace %d. Not loading GraphQL schema.\n",
				ld.opt.Namespace)
			return
		}
		process(ld.opt.Namespace, schemas[ld.opt.Namespace])
	}
}
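// For illustration: with ns = 1 and a (hypothetical) schema string `type Task { id: ID! }`,
// the RdfFormat branch of process above would emit triples along these lines, since %#x
// renders 1 as 0x1 and the schema has already been strconv.Quoted:
//
//	_:gqlschema <dgraph.type> "dgraph.graphql" <0x1> .
//	_:gqlschema <dgraph.graphql.xid> "dgraph.graphql.schema" <0x1> .
//	_:gqlschema <dgraph.graphql.schema> "type Task { id: ID! }" <0x1> .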
// reduceStage runs the reduce phase over the map output.
func (ld *loader) reduceStage() {
	ld.prog.setPhase(reducePhase)

	r := reducer{
		state:     ld.state,
		streamIds: make(map[string]uint32),
	}
	x.Check(r.run())
}

// writeSchema distributes the schema predicates across the output DBs and writes them out.
func (ld *loader) writeSchema() {
	numDBs := uint32(len(ld.dbs))
	preds := make([][]string, numDBs)

	// Get all predicates that have data in some DB.
	m := make(map[string]struct{})
	for i, db := range ld.dbs {
		preds[i] = ld.schema.getPredicates(db)
		for _, p := range preds[i] {
			m[p] = struct{}{}
		}
	}

	// Find any predicates that don't have data in any DB
	// and distribute them among all the DBs.
	for p := range ld.schema.schemaMap {
		if _, ok := m[p]; !ok {
			i := adler32.Checksum([]byte(p)) % numDBs
			preds[i] = append(preds[i], p)
		}
	}

	// Write out each DB's final predicate list.
	for i, db := range ld.dbs {
		ld.schema.write(db, preds[i])
	}
}

// cleanup closes the output DBs, removes the temporary DBs, and prints the final summary.
func (ld *loader) cleanup() {
	for _, db := range ld.dbs {
		x.Check(db.Close())
	}
	for _, db := range ld.tmpDbs {
		opts := db.Opts()
		x.Check(db.Close())
		x.Check(os.RemoveAll(opts.Dir))
	}
	ld.prog.endSummary()
}
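// For orientation, the phases above are driven from this package's run path in roughly this
// order (a sketch, not the verbatim driver; mergeMapShardsIntoReduceShards is assumed to be a
// helper elsewhere in this package that regroups map shards for the reducers):
//
//	ld := newLoader(&opt)
//	if !opt.SkipMapPhase {
//		ld.mapStage()
//		mergeMapShardsIntoReduceShards(&opt) // assumed helper
//	}
//	ld.leaseNamespaces() // lease namespace IDs seen during the map phase
//	ld.reduceStage()
//	ld.writeSchema()
//	ld.cleanup()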