github.com/dshekhar95/sub_dgraph@v0.0.0-20230424164411-6be28e40bbf1/dgraph/cmd/bulk/loader.go

/*
 * Copyright 2017-2022 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package bulk

import (
	"bytes"
	"compress/gzip"
	"context"
	"encoding/json"
	"fmt"
	"hash/adler32"
	"io"
	"log"
	"math"
	"os"
	"path/filepath"
	"strconv"
	"sync"
	"time"

	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials"
	"google.golang.org/grpc/credentials/insecure"

	"github.com/dgraph-io/badger/v3"
	"github.com/dgraph-io/badger/v3/y"
	"github.com/dgraph-io/dgraph/chunker"
	"github.com/dgraph-io/dgraph/ee/enc"
	"github.com/dgraph-io/dgraph/filestore"
	"github.com/dgraph-io/dgraph/protos/pb"
	"github.com/dgraph-io/dgraph/schema"
	"github.com/dgraph-io/dgraph/x"
	"github.com/dgraph-io/dgraph/xidmap"
)

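// options holds the configuration for a bulk load run; it is populated from
// the command-line flags before newLoader is called. As a rough, hypothetical
// sketch (the values below are illustrative, not defaults), a caller might
// fill in fields such as:
//
//	opt := &options{
//		DataFiles:     "data.rdf.gz",
//		SchemaFile:    "data.schema",
//		ZeroAddr:      "localhost:5080",
//		NumGoroutines: 4,
//		MapShards:     1,
//		ReduceShards:  1,
//	}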
type options struct {
	DataFiles        string
	DataFormat       string
	SchemaFile       string
	GqlSchemaFile    string
	OutDir           string
	ReplaceOutDir    bool
	TmpDir           string
	NumGoroutines    int
	MapBufSize       uint64
	PartitionBufSize int64
	SkipMapPhase     bool
	CleanupTmp       bool
	NumReducers      int
	Version          bool
	StoreXids        bool
	ZeroAddr         string
	HttpAddr         string
	IgnoreErrors     bool
	CustomTokenizers string
	NewUids          bool
	ClientDir        string
	Encrypted        bool
	EncryptedOut     bool

	MapShards    int
	ReduceShards int

	Namespace uint64

	shardOutputDirs []string

	// ........... Badger options ..........
	// EncryptionKey is the key used for encryption. Enterprise-only feature.
	EncryptionKey x.Sensitive
	// Badger holds the options passed through to the output badger DBs.
	Badger badger.Options
}

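// state is shared by the loader and all of its mappers. It bundles the
// parsed schema, the xid->uid map, the channel feeding raw input chunks to
// the mappers, and the badger DBs that receive the reduced output.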
type state struct {
	opt           *options
	prog          *progress
	xids          *xidmap.XidMap
	schema        *schemaStore
	shards        *shardMap
	readerChunkCh chan *bytes.Buffer
	mapFileId     uint32 // Used atomically to name the output files of the mappers.
	dbs           []*badger.DB
	tmpDbs        []*badger.DB // Temporary DBs to write the split lists to, avoiding ordering issues.
	writeTs       uint64       // All badger writes use this timestamp.
	namespaces    *sync.Map    // To store the encountered namespaces.
}

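// loader orchestrates the bulk load. It embeds the shared state and owns
// the mappers as well as the gRPC connection to zero.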
type loader struct {
	*state
	mappers []*mapper
	zero    *grpc.ClientConn
}

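// newLoader dials zero (using TLS when configured), fetches the write
// timestamp used for all badger writes, and creates one mapper per
// configured goroutine. Any failure here is fatal.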
func newLoader(opt *options) *loader {
	if opt == nil {
		log.Fatalf("Cannot create loader with nil options.")
	}

	fmt.Printf("Connecting to zero at %s\n", opt.ZeroAddr)
	ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
	defer cancel()

	tlsConf, err := x.LoadClientTLSConfigForInternalPort(Bulk.Conf)
	x.Check(err)
	dialOpts := []grpc.DialOption{
		grpc.WithBlock(),
	}
	if tlsConf != nil {
		dialOpts = append(dialOpts, grpc.WithTransportCredentials(credentials.NewTLS(tlsConf)))
	} else {
		dialOpts = append(dialOpts, grpc.WithTransportCredentials(insecure.NewCredentials()))
	}
	zero, err := grpc.DialContext(ctx, opt.ZeroAddr, dialOpts...)
	x.Checkf(err, "Unable to connect to zero. Is it running at %s?", opt.ZeroAddr)
	st := &state{
		opt:    opt,
		prog:   newProgress(),
		shards: newShardMap(opt.MapShards),
		// Lots of gz readers, so not much channel buffering is needed.
		readerChunkCh: make(chan *bytes.Buffer, opt.NumGoroutines),
		writeTs:       getWriteTimestamp(zero),
		namespaces:    &sync.Map{},
	}
	st.schema = newSchemaStore(readSchema(opt), opt, st)
	ld := &loader{
		state:   st,
		mappers: make([]*mapper, opt.NumGoroutines),
		zero:    zero,
	}
	for i := 0; i < opt.NumGoroutines; i++ {
		ld.mappers[i] = newMapper(st)
	}
	go ld.prog.report()
	return ld
}

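// getWriteTimestamp asks zero for a single timestamp via
//
//	pb.NewZeroClient(zero).Timestamps(ctx, &pb.Num{Val: 1})
//
// retrying once per second until the call succeeds. Every badger write in
// this run is stamped with the returned start id.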
func getWriteTimestamp(zero *grpc.ClientConn) uint64 {
	client := pb.NewZeroClient(zero)
	for {
		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
		ts, err := client.Timestamps(ctx, &pb.Num{Val: 1})
		cancel()
		if err == nil {
			return ts.GetStartId()
		}
		fmt.Printf("Error communicating with dgraph zero, retrying: %v\n", err)
		time.Sleep(time.Second)
	}
}

// leaseNamespaces is called at the end of the map phase. It leases namespace
// IDs up to the maximum namespace ID seen in the data.
func (ld *loader) leaseNamespaces() {
	var maxNs uint64
	ld.namespaces.Range(func(key, value interface{}) bool {
		if ns := key.(uint64); ns > maxNs {
			maxNs = ns
		}
		return true
	})

	// If only the default namespace is seen, do nothing.
	if maxNs == 0 {
		return
	}

	client := pb.NewZeroClient(ld.zero)
	for {
		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
		ns, err := client.AssignIds(ctx, &pb.Num{Val: maxNs, Type: pb.Num_NS_ID})
		cancel()
		if err == nil {
			fmt.Printf("Assigned namespaces up to %d\n", ns.GetEndId())
			return
		}
		fmt.Printf("Error communicating with dgraph zero, retrying: %v\n", err)
		time.Sleep(time.Second)
	}
}

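// readSchema opens the schema file, decrypting it when an encryption key is
// in effect and gunzipping it when the file name ends in .gz, and parses the
// result under the target namespace.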
func readSchema(opt *options) *schema.ParsedSchema {
	f, err := filestore.Open(opt.SchemaFile)
	x.Check(err)
	defer f.Close()

	key := opt.EncryptionKey
	if !opt.Encrypted {
		key = nil
	}
	r, err := enc.GetReader(key, f)
	x.Check(err)
	if filepath.Ext(opt.SchemaFile) == ".gz" {
		r, err = gzip.NewReader(r)
		x.Check(err)
	}

	buf, err := io.ReadAll(r)
	x.Check(err)

	result, err := schema.ParseWithNamespace(string(buf), opt.Namespace)
	x.Check(err)
	return result
}

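// mapStage runs the map phase. It discovers the input files, starts the
// mappers, and streams chunks of each file (decrypted and decompressed as
// needed) into readerChunkCh, followed by the optional GraphQL schema. Once
// the channel is drained, the xid map is flushed and the mappers are
// released so their memory can be reclaimed before the reduce phase.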
func (ld *loader) mapStage() {
	ld.prog.setPhase(mapPhase)
	var db *badger.DB
	if len(ld.opt.ClientDir) > 0 {
		x.Check(os.MkdirAll(ld.opt.ClientDir, 0700))

		var err error
		db, err = badger.Open(badger.DefaultOptions(ld.opt.ClientDir))
		x.Checkf(err, "Error while creating badger KV posting store")
	}
	ld.xids = xidmap.New(xidmap.XidMapOptions{
		UidAssigner: ld.zero,
		DB:          db,
		Dir:         filepath.Join(ld.opt.TmpDir, bufferDir),
	})

	fs := filestore.NewFileStore(ld.opt.DataFiles)

	files := fs.FindDataFiles(ld.opt.DataFiles, []string{".rdf", ".rdf.gz", ".json", ".json.gz"})
	if len(files) == 0 {
		fmt.Printf("No data files found in %s.\n", ld.opt.DataFiles)
		os.Exit(1)
	}

	// Because mappers must handle chunks that may come from different input files, they must
	// all assume the same data format, either RDF or JSON. Use the format specified by the
	// user, or else the one detected from the first load file.
	loadType := chunker.DataFormat(files[0], ld.opt.DataFormat)
	if loadType == chunker.UnknownFormat {
		// Don't try to detect JSON input in the bulk loader.
		fmt.Printf("Need --format=rdf or --format=json to load %s\n", files[0])
		os.Exit(1)
	}

	var mapperWg sync.WaitGroup
	mapperWg.Add(len(ld.mappers))
	for _, m := range ld.mappers {
		go func(m *mapper) {
			m.run(loadType)
			mapperWg.Done()
		}(m)
	}

	// This is the main map loop.
	thr := y.NewThrottle(ld.opt.NumGoroutines)
	for i, file := range files {
		x.Check(thr.Do())
		fmt.Printf("Processing file (%d out of %d): %s\n", i+1, len(files), file)

		go func(file string) {
			defer thr.Done(nil)

			key := ld.opt.EncryptionKey
			if !ld.opt.Encrypted {
				key = nil
			}
			r, cleanup := fs.ChunkReader(file, key)
			defer cleanup()

			chunk := chunker.NewChunker(loadType, 1000)
			for {
				chunkBuf, err := chunk.Chunk(r)
				if chunkBuf != nil && chunkBuf.Len() > 0 {
					ld.readerChunkCh <- chunkBuf
				}
				if err == io.EOF {
					break
				} else if err != nil {
					x.Check(err)
				}
			}
		}(file)
	}
	x.Check(thr.Finish())

	// Send the GraphQL schema triples.
	ld.processGqlSchema(loadType)

	close(ld.readerChunkCh)
	mapperWg.Wait()

	// Release the mappers so their memory can be GCed before the reduce phase.
	for i := range ld.mappers {
		ld.mappers[i] = nil
	}
	x.Check(ld.xids.Flush())
	if db != nil {
		x.Check(db.Close())
	}
	ld.xids = nil
}

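// parseGqlSchema decodes an exported GraphQL schema into a map from
// namespace to schema text. Exports from 21.03 onwards are a JSON array of
// per-namespace entries; anything older is a bare schema string, which is
// attributed wholesale to the galaxy namespace.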
func parseGqlSchema(s string) map[uint64]string {
	var schemas []x.ExportedGQLSchema
	if err := json.Unmarshal([]byte(s), &schemas); err != nil {
		fmt.Println("Error while decoding the GraphQL schema. Assuming it to be in the pre-21.03 format.")
		return map[uint64]string{x.GalaxyNamespace: s}
	}

	schemaMap := make(map[uint64]string)
	for _, schema := range schemas {
		if _, ok := schemaMap[schema.Namespace]; ok {
			fmt.Printf("Found multiple GraphQL schemas for namespace %d.\n", schema.Namespace)
			continue
		}
		schemaMap[schema.Namespace] = schema.Schema
	}
	return schemaMap
}

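// processGqlSchema feeds the GraphQL schema to the mappers as ordinary input
// chunks in the detected load format, once per namespace that actually has
// data. In RDF mode, for instance, namespace 0 would produce triples of the
// form:
//
//	_:gqlschema <dgraph.type> "dgraph.graphql" <0x0> .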
func (ld *loader) processGqlSchema(loadType chunker.InputFormat) {
	if ld.opt.GqlSchemaFile == "" {
		return
	}

	f, err := filestore.Open(ld.opt.GqlSchemaFile)
	x.Check(err)
	defer f.Close()

	key := ld.opt.EncryptionKey
	if !ld.opt.Encrypted {
		key = nil
	}
	r, err := enc.GetReader(key, f)
	x.Check(err)
	if filepath.Ext(ld.opt.GqlSchemaFile) == ".gz" {
		r, err = gzip.NewReader(r)
		x.Check(err)
	}

	buf, err := io.ReadAll(r)
	x.Check(err)

	rdfSchema := `_:gqlschema <dgraph.type> "dgraph.graphql" <%#x> .
	_:gqlschema <dgraph.graphql.xid> "dgraph.graphql.schema" <%#x> .
	_:gqlschema <dgraph.graphql.schema> %s <%#x> .
	`

	jsonSchema := `{
		"namespace": "%#x",
		"dgraph.type": "dgraph.graphql",
		"dgraph.graphql.xid": "dgraph.graphql.schema",
		"dgraph.graphql.schema": %s
	}`

	process := func(ns uint64, schema string) {
		// Ignore the schema if the namespace was not seen in the data.
		if _, ok := ld.schema.namespaces.Load(ns); !ok {
			fmt.Printf("No data exists for namespace %d. Cannot load the GraphQL schema.\n", ns)
			return
		}
		gqlBuf := &bytes.Buffer{}
		schema = strconv.Quote(schema)
		switch loadType {
		case chunker.RdfFormat:
			x.Check2(gqlBuf.Write([]byte(fmt.Sprintf(rdfSchema, ns, ns, schema, ns))))
		case chunker.JsonFormat:
			x.Check2(gqlBuf.Write([]byte(fmt.Sprintf(jsonSchema, ns, schema))))
		}
		ld.readerChunkCh <- gqlBuf
	}

	schemas := parseGqlSchema(string(buf))
	if ld.opt.Namespace == math.MaxUint64 {
		// Preserve the namespaces as they are.
		for ns, schema := range schemas {
			process(ns, schema)
		}
		return
	}

	switch len(schemas) {
	case 1:
		// The user might have exported from a different namespace, so schema.Namespace
		// may not hold the correct value.
		for _, schema := range schemas {
			process(ld.opt.Namespace, schema)
		}
	default:
		if _, ok := schemas[ld.opt.Namespace]; !ok {
			// We expect only a single GraphQL schema when loading into a specific namespace.
			fmt.Printf("Didn't find a GraphQL schema for namespace %d. Not loading the GraphQL schema.\n",
				ld.opt.Namespace)
			return
		}
		process(ld.opt.Namespace, schemas[ld.opt.Namespace])
	}
}

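// reduceStage runs the reduce phase: a single reducer merges the output of
// the map phase into the final badger DBs.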
func (ld *loader) reduceStage() {
	ld.prog.setPhase(reducePhase)

	r := reducer{
		state:     ld.state,
		streamIds: make(map[string]uint32),
	}
	x.Check(r.run())
}

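// writeSchema writes the schema into every output DB. A predicate that has
// data is written to the DBs holding that data; a predicate with no data is
// assigned to exactly one DB by hashing its name, so the full schema is
// still covered across the shards.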
func (ld *loader) writeSchema() {
	numDBs := uint32(len(ld.dbs))
	preds := make([][]string, numDBs)

	// Get all predicates that have data in some DB.
	m := make(map[string]struct{})
	for i, db := range ld.dbs {
		preds[i] = ld.schema.getPredicates(db)
		for _, p := range preds[i] {
			m[p] = struct{}{}
		}
	}

	// Find any predicates that don't have data in any DB
	// and distribute them among all the DBs.
	for p := range ld.schema.schemaMap {
		if _, ok := m[p]; !ok {
			i := adler32.Checksum([]byte(p)) % numDBs
			preds[i] = append(preds[i], p)
		}
	}

	// Write out each DB's final predicate list.
	for i, db := range ld.dbs {
		ld.schema.write(db, preds[i])
	}
}

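// cleanup closes all output DBs, closes and deletes the temporary split-list
// DBs along with their directories, and prints the final progress summary.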
func (ld *loader) cleanup() {
	for _, db := range ld.dbs {
		x.Check(db.Close())
	}
	for _, db := range ld.tmpDbs {
		opts := db.Opts()
		x.Check(db.Close())
		x.Check(os.RemoveAll(opts.Dir))
	}
	ld.prog.endSummary()
}