github.com/dgraph-io/dgraph@v1.1.0/dgraph/cmd/live/run.go

/*
 * Copyright 2017-2018 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package live

import (
	"bufio"
	"compress/gzip"
	"context"
	"crypto/tls"
	"fmt"
	"io"
	"io/ioutil"
	"math"
	"math/rand"
	"net/http"
	_ "net/http/pprof" // http profiler
	"os"
	"strconv"
	"strings"
	"sync"
	"time"

	"google.golang.org/grpc/metadata"

	"github.com/dgraph-io/badger"
	bopt "github.com/dgraph-io/badger/options"
	"github.com/dgraph-io/dgo"
	"github.com/dgraph-io/dgo/protos/api"

	"github.com/dgraph-io/dgraph/chunker"
	"github.com/dgraph-io/dgraph/x"
	"github.com/dgraph-io/dgraph/xidmap"

	"github.com/golang/glog"
	"github.com/pkg/errors"
	"github.com/spf13/cobra"
)

type options struct {
	dataFiles      string
	dataFormat     string
	schemaFile     string
	zero           string
	concurrent     int
	batchSize      int
	clientDir      string
	authToken      string
	useCompression bool
	newUids        bool
	verbose        bool
}

var (
	opt    options
	tlsCfg *tls.Config
	// Live is the sub-command invoked when running "dgraph live".
	Live x.SubCommand
)

func init() {
	Live.Cmd = &cobra.Command{
		Use:   "live",
		Short: "Run Dgraph live loader",
		Run: func(cmd *cobra.Command, args []string) {
			defer x.StartProfile(Live.Conf).Stop()
			if err := run(); err != nil {
				os.Exit(1)
			}
		},
	}
	Live.EnvPrefix = "DGRAPH_LIVE"

	flag := Live.Cmd.Flags()
	flag.StringP("files", "f", "", "Location of *.rdf(.gz) or *.json(.gz) file(s) to load")
	flag.StringP("schema", "s", "", "Location of schema file")
	flag.String("format", "", "Specify file format (rdf or json) instead of getting it from filename")
	flag.StringP("alpha", "a", "127.0.0.1:9080",
		"Comma-separated list of Dgraph alpha gRPC server addresses")
	flag.StringP("zero", "z", "127.0.0.1:5080", "Dgraph zero gRPC server address")
	flag.IntP("conc", "c", 10,
		"Number of concurrent requests to make to Dgraph")
	flag.IntP("batch", "b", 1000,
		"Number of N-Quads to send as part of a mutation.")
	flag.StringP("xidmap", "x", "", "Directory to store xid to uid mapping")
	flag.StringP("auth_token", "t", "",
		"The auth token passed to the server for the Alter operation on the schema file")
	flag.BoolP("use_compression", "C", false,
		"Enable compression on connection to alpha server")
	flag.Bool("new_uids", false,
		"Ignore UIDs in load files and assign new ones.")
	flag.Bool("verbose", false, "Run the live loader in verbose mode")
	flag.StringP("user", "u", "", "Username if login is required.")
	flag.StringP("password", "p", "", "Password of the user.")

	// TLS configuration
	x.RegisterClientTLSFlags(flag)
}
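
// Example invocation (illustrative values; the defaults registered above
// apply when flags are omitted):
//
//	dgraph live -f data.rdf.gz -s data.schema -a localhost:9080 -z localhost:5080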

// processSchemaFile processes the given schema file (plain or gzipped) and
// applies it to the cluster through an Alter request.
func processSchemaFile(ctx context.Context, file string, dgraphClient *dgo.Dgraph) error {
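	// A schema file holds definitions in Dgraph schema syntax, for example
	// (illustrative):
	//
	//	name: string @index(term) .
	//	age: int .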
	fmt.Printf("\nProcessing schema file %q\n", file)
	if len(opt.authToken) > 0 {
		md := metadata.New(nil)
		md.Append("auth-token", opt.authToken)
		ctx = metadata.NewOutgoingContext(ctx, md)
	}

	f, err := os.Open(file)
	x.CheckfNoTrace(err)
	defer f.Close()

	var reader io.Reader
	if strings.HasSuffix(strings.ToLower(file), ".gz") {
		reader, err = gzip.NewReader(f)
		x.Check(err)
	} else {
		reader = f
	}

	b, err := ioutil.ReadAll(reader)
	x.Checkf(err, "Error while reading file")

	op := &api.Operation{}
	op.Schema = string(b)
	return dgraphClient.Alter(ctx, op)
}

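// uid converts an external id into Dgraph's "0x..." hex uid format,
// allocating a fresh uid through the zero server when the value does not
// already name one (or always, when --new_uids is set).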
func (l *loader) uid(val string) string {
	// Attempt to parse as a UID (in the same format that dgraph outputs - a
	// hex number prefixed by "0x"). If parsing succeeds, then this is assumed
	// to be an existing node in the graph. There is limited protection against
	// a user selecting an unassigned UID in this way - it may be assigned
	// later to another node. It is up to the user to avoid this.
	if !opt.newUids {
		if uid, err := strconv.ParseUint(val, 0, 64); err == nil {
			l.alloc.BumpTo(uid)
			return fmt.Sprintf("%#x", uid)
		}
	}

	uid := l.alloc.AssignUid(val)
	return fmt.Sprintf("%#x", uint64(uid))
}

// processFile forwards a file to the RDF or JSON processor as appropriate.
func (l *loader) processFile(ctx context.Context, filename string) error {
	fmt.Printf("Processing data file %q\n", filename)

	rd, cleanup := chunker.FileReader(filename)
	defer cleanup()

	loadType := chunker.DataFormat(filename, opt.dataFormat)
	if loadType == chunker.UnknownFormat {
		// The filename gives no hint, so sniff the content for JSON; anything
		// else requires an explicit --format.
		if isJson, err := chunker.IsJSONData(rd); err != nil || !isJson {
			return errors.Errorf("need --format=rdf or --format=json to load %s", filename)
		}
		loadType = chunker.JsonFormat
	}

	return l.processLoadFile(ctx, rd, chunker.NewChunker(loadType, opt.batchSize))
}

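// processLoadFile reads the input in chunks, parses each chunk into N-Quads,
// and streams them as mutations on l.reqs until EOF.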
func (l *loader) processLoadFile(ctx context.Context, rd *bufio.Reader, ck chunker.Chunker) error {
	var wg sync.WaitGroup
	wg.Add(1)
	nqbuf := ck.NQuads()
	// Spin up a goroutine to push NQuads to the mutation channel.
	go func() {
		defer wg.Done()
		for nqs := range nqbuf.Ch() {
			if len(nqs) == 0 {
				continue
			}
			for _, nq := range nqs {
				nq.Subject = l.uid(nq.Subject)
				if len(nq.ObjectId) > 0 {
					nq.ObjectId = l.uid(nq.ObjectId)
				}
			}

			mu := api.Mutation{Set: nqs}
			l.reqs <- mu
		}
	}()

	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
		}

		chunkBuf, err := ck.Chunk(rd)
		// Parse the chunk before inspecting err: the chunker may hand back a
		// final chunk together with io.EOF. Parse groups the entries into
		// batches of opt.batchSize each and sends them to the loader.reqs
		// channel via the goroutine above.
		if oerr := ck.Parse(chunkBuf); oerr != nil {
			return errors.Wrap(oerr, "while parsing chunk in processLoadFile")
		}
		if err == io.EOF {
			break
		}
		x.Check(err)
	}
	nqbuf.Flush()
	wg.Wait()

	return nil
}

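// setup opens the optional badger store backing the xid-to-uid map, connects
// to the zero server for uid allocation, and starts opts.Pending worker
// goroutines that send the queued mutations to Dgraph.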
func setup(opts batchMutationOptions, dc *dgo.Dgraph) *loader {
	var db *badger.DB
	if len(opt.clientDir) > 0 {
		x.Check(os.MkdirAll(opt.clientDir, 0700))

		var err error
		db, err = badger.Open(badger.DefaultOptions(opt.clientDir).
			WithTableLoadingMode(bopt.MemoryMap).
			WithSyncWrites(false))
		x.Checkf(err, "Error while creating badger KV store")
	}

	// Compression on the connection to the zero server actually makes things
	// worse, so it stays disabled here.
	connzero, err := x.SetupConnection(opt.zero, tlsCfg, false)
	x.Checkf(err, "Unable to connect to zero. Is it running at %s?", opt.zero)

	alloc := xidmap.New(connzero, db)
	l := &loader{
		opts:     opts,
		dc:       dc,
		start:    time.Now(),
		reqs:     make(chan api.Mutation, opts.Pending*2),
		alloc:    alloc,
		db:       db,
		zeroconn: connzero,
	}

	l.requestsWg.Add(opts.Pending)
	for i := 0; i < opts.Pending; i++ {
		go l.makeRequests()
	}

	rand.Seed(time.Now().Unix())
	return l
}

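// run drives the live load: it applies the schema file (if any), processes
// all data files concurrently, waits for pending and retried requests to
// drain, and prints summary counters.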
func run() error {
	x.PrintVersion()
	opt = options{
		dataFiles:      Live.Conf.GetString("files"),
		dataFormat:     Live.Conf.GetString("format"),
		schemaFile:     Live.Conf.GetString("schema"),
		zero:           Live.Conf.GetString("zero"),
		concurrent:     Live.Conf.GetInt("conc"),
		batchSize:      Live.Conf.GetInt("batch"),
		clientDir:      Live.Conf.GetString("xidmap"),
		authToken:      Live.Conf.GetString("auth_token"),
		useCompression: Live.Conf.GetBool("use_compression"),
		newUids:        Live.Conf.GetBool("new_uids"),
		verbose:        Live.Conf.GetBool("verbose"),
	}
	go func() {
		if err := http.ListenAndServe("localhost:6060", nil); err != nil {
			glog.Errorf("Error while starting HTTP server on port 6060: %+v", err)
		}
	}()
	ctx := context.Background()
	bmOpts := batchMutationOptions{
		Size:          opt.batchSize,
		Pending:       opt.concurrent,
		PrintCounters: true,
		Ctx:           ctx,
		MaxRetries:    math.MaxUint32,
	}

	dg, closeFunc := x.GetDgraphClient(Live.Conf, true)
	defer closeFunc()

	l := setup(bmOpts, dg)
	defer l.zeroconn.Close()

	if len(opt.schemaFile) > 0 {
		if err := processSchemaFile(ctx, opt.schemaFile, dg); err != nil {
			if err == context.Canceled {
				fmt.Printf("Interrupted while processing schema file %q\n", opt.schemaFile)
				return nil
			}
			fmt.Printf("Error while processing schema file %q: %s\n", opt.schemaFile, err)
			return err
		}
		fmt.Printf("Processed schema file %q\n\n", opt.schemaFile)
	}

	if opt.dataFiles == "" {
		return errors.New("RDF or JSON file(s) location must be specified")
	}

	filesList := x.FindDataFiles(opt.dataFiles, []string{".rdf", ".rdf.gz", ".json", ".json.gz"})
	totalFiles := len(filesList)
	if totalFiles == 0 {
		return errors.Errorf("No data files found in %s", opt.dataFiles)
	}
	fmt.Printf("Found %d data file(s) to process\n", totalFiles)

	//	x.Check(dgraphClient.NewSyncMarks(filesList))
	errCh := make(chan error, totalFiles)
	for _, file := range filesList {
		file = strings.Trim(file, " \t")
		go func(file string) {
			// Wrap any error with the file name here: results arrive on errCh
			// in completion order, not in filesList order.
			errCh <- errors.Wrapf(l.processFile(ctx, file), "while processing file %q", file)
		}(file)
	}

	// PrintCounters should be called after the schema has been updated.
	if bmOpts.PrintCounters {
		go l.printCounters()
	}

	for i := 0; i < totalFiles; i++ {
		if err := <-errCh; err != nil {
			fmt.Printf("Error while processing data file: %s\n", err)
			return err
		}
	}

	close(l.reqs)
	// First we wait for requestsWg; when it is done we know all retry requests
	// have been added to retryRequestsWg. We can't use a single waitgroup
	// because, by the time we called Wait on it, we couldn't be sure that all
	// retry requests had been added to it.
	l.requestsWg.Wait()
	l.retryRequestsWg.Wait()
	c := l.Counter()
	var rate uint64
	if c.Elapsed.Seconds() < 1 {
		rate = c.Nquads
	} else {
		rate = c.Nquads / uint64(c.Elapsed.Seconds())
	}
	// Print an empty line first; otherwise the summary below would be written
	// over the counter line still pending from printCounters.
	fmt.Printf("%100s\r", "")
	fmt.Printf("Number of TXs run            : %d\n", c.TxnsDone)
	fmt.Printf("Number of N-Quads processed  : %d\n", c.Nquads)
	fmt.Printf("Time spent                   : %v\n", c.Elapsed)
	fmt.Printf("N-Quads processed per second : %d\n", rate)

	if l.db != nil {
		l.alloc.Flush()
		l.db.Close()
	}
	return nil
}