github.com/unigraph-dev/dgraph@v1.1.1-0.20200923154953-8b52b426f765/dgraph/cmd/bulk/loader.go

/*
 * Copyright 2017-2018 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package bulk

import (
	"bytes"
	"compress/gzip"
	"context"
	"fmt"
	"hash/adler32"
	"io"
	"io/ioutil"
	"os"
	"path/filepath"
	"sync"
	"time"

	"github.com/dgraph-io/badger"
	"github.com/dgraph-io/badger/y"

	"github.com/dgraph-io/dgraph/chunker"
	"github.com/dgraph-io/dgraph/protos/pb"
	"github.com/dgraph-io/dgraph/schema"
	"github.com/dgraph-io/dgraph/x"
	"github.com/dgraph-io/dgraph/xidmap"

	"google.golang.org/grpc"
)

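// options holds the settings that control a bulk load run, as populated from
// the command-line flags of the bulk subcommand.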
type options struct {
	DataFiles        string
	DataFormat       string
	SchemaFile       string
	OutDir           string
	ReplaceOutDir    bool
	TmpDir           string
	NumGoroutines    int
	MapBufSize       uint64
	SkipMapPhase     bool
	CleanupTmp       bool
	NumReducers      int
	Version          bool
	StoreXids        bool
	ZeroAddr         string
	HttpAddr         string
	IgnoreErrors     bool
	CustomTokenizers string
	NewUids          bool

	MapShards    int
	ReduceShards int

	shardOutputDirs []string
}

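// state is the context shared by the mappers and the reducer during a bulk
// load.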
type state struct {
	opt           options
	prog          *progress
	xids          *xidmap.XidMap
	schema        *schemaStore
	shards        *shardMap
	readerChunkCh chan *bytes.Buffer
	mapFileId     uint32 // Used atomically to name the output files of the mappers.
	dbs           []*badger.DB
	writeTs       uint64 // All badger writes use this timestamp
}

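// loader drives the overall bulk load: it owns the shared state, the pool of
// mappers, and the gRPC connection to Zero. Its phases are expected to run in
// order: mapStage, then reduceStage, then writeSchema, and finally cleanup.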
type loader struct {
	*state
	mappers []*mapper
	zero    *grpc.ClientConn
}

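// newLoader blocks until it can reach Zero at opt.ZeroAddr (up to one
// minute), leases a write timestamp, parses the schema file, and sets up one
// mapper per goroutine.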
func newLoader(opt options) *loader {
	fmt.Printf("Connecting to zero at %s\n", opt.ZeroAddr)

	ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
	defer cancel()

	zero, err := grpc.DialContext(ctx, opt.ZeroAddr,
		grpc.WithBlock(),
		grpc.WithInsecure())
	x.Checkf(err, "Unable to connect to zero. Is it running at %s?", opt.ZeroAddr)
	st := &state{
		opt:    opt,
		prog:   newProgress(),
		shards: newShardMap(opt.MapShards),
		// Lots of gz readers, so not much channel buffer needed.
		readerChunkCh: make(chan *bytes.Buffer, opt.NumGoroutines),
		writeTs:       getWriteTimestamp(zero),
	}
	st.schema = newSchemaStore(readSchema(opt.SchemaFile), opt, st)
	ld := &loader{
		state:   st,
		mappers: make([]*mapper, opt.NumGoroutines),
		zero:    zero,
	}
	for i := 0; i < opt.NumGoroutines; i++ {
		ld.mappers[i] = newMapper(st)
	}
	go ld.prog.report()
	return ld
}

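// getWriteTimestamp leases a single timestamp from Zero, retrying every
// second until the request succeeds. All badger writes in this run are
// stamped with it.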
func getWriteTimestamp(zero *grpc.ClientConn) uint64 {
	client := pb.NewZeroClient(zero)
	for {
		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
		ts, err := client.Timestamps(ctx, &pb.Num{Val: 1})
		cancel()
		if err == nil {
			return ts.GetStartId()
		}
		fmt.Printf("Error communicating with dgraph zero, retrying: %v\n", err)
		time.Sleep(time.Second)
	}
}

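// readSchema loads and parses the schema file, transparently decompressing
// it when the filename ends in ".gz". Any failure aborts the process.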
func readSchema(filename string) *schema.ParsedSchema {
	f, err := os.Open(filename)
	x.Check(err)
	defer f.Close()
	var r io.Reader = f
	if filepath.Ext(filename) == ".gz" {
		r, err = gzip.NewReader(f)
		x.Check(err)
	}

	buf, err := ioutil.ReadAll(r)
	x.Check(err)

	result, err := schema.Parse(string(buf))
	x.Check(err)
	return result
}

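// mapStage runs the map phase. A pool of mappers consumes readerChunkCh
// while goroutines spawned here walk the input files, chunking each one and
// feeding the chunks into the channel. It returns once every chunk has been
// mapped.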
func (ld *loader) mapStage() {
	ld.prog.setPhase(mapPhase)
	ld.xids = xidmap.New(ld.zero, nil)

	files := x.FindDataFiles(ld.opt.DataFiles, []string{".rdf", ".rdf.gz", ".json", ".json.gz"})
	if len(files) == 0 {
		fmt.Printf("No data files found in %s.\n", ld.opt.DataFiles)
		os.Exit(1)
	}

	// Because mappers must handle chunks that may come from different input files, they all
	// must assume the same data format, either RDF or JSON. Use the format specified by the
	// user, or else the one inferred from the first input file.
	loadType := chunker.DataFormat(files[0], ld.opt.DataFormat)
	if loadType == chunker.UnknownFormat {
		// Don't try to detect JSON input in the bulk loader.
		fmt.Printf("Need --format=rdf or --format=json to load %s\n", files[0])
		os.Exit(1)
	}

	var mapperWg sync.WaitGroup
	mapperWg.Add(len(ld.mappers))
	for _, m := range ld.mappers {
		go func(m *mapper) {
			m.run(loadType)
			mapperWg.Done()
		}(m)
	}

	// This is the main map loop.
	thr := y.NewThrottle(ld.opt.NumGoroutines)
	for i, file := range files {
		x.Check(thr.Do())
		fmt.Printf("Processing file (%d out of %d): %s\n", i+1, len(files), file)

		go func(file string) {
			defer thr.Done(nil)

			r, cleanup := chunker.FileReader(file)
			defer cleanup()

			ck := chunker.NewChunker(loadType, 1000)
			for {
				chunkBuf, err := ck.Chunk(r)
				if chunkBuf != nil && chunkBuf.Len() > 0 {
					ld.readerChunkCh <- chunkBuf
				}
				if err == io.EOF {
					break
				} else if err != nil {
					x.Check(err)
				}
			}
		}(file)
	}
	x.Check(thr.Finish())

	close(ld.readerChunkCh)
	mapperWg.Wait()

	// Allow memory to GC before the reduce phase.
	for i := range ld.mappers {
		ld.mappers[i] = nil
	}
	x.Check(ld.xids.Flush())
	ld.xids = nil
}

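// reduceStage runs the reduce phase, merging the map output into the output
// badger DBs.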
func (ld *loader) reduceStage() {
	ld.prog.setPhase(reducePhase)

	r := reducer{state: ld.state}
	x.Check(r.run())
}

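// writeSchema writes the schema into the output DBs. Each predicate that has
// data goes to the DBs that hold that data; predicates with no data are
// spread across the DBs by hashing the predicate name.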
func (ld *loader) writeSchema() {
	numDBs := uint32(len(ld.dbs))
	preds := make([][]string, numDBs)

	// Get all predicates that have data in some DB.
	m := make(map[string]struct{})
	for i, db := range ld.dbs {
		preds[i] = ld.schema.getPredicates(db)
		for _, p := range preds[i] {
			m[p] = struct{}{}
		}
	}

	// Find any predicates that don't have data in any DB
	// and distribute them among all the DBs.
	for p := range ld.schema.schemaMap {
		if _, ok := m[p]; !ok {
			i := adler32.Checksum([]byte(p)) % numDBs
			preds[i] = append(preds[i], p)
		}
	}

	// Write out each DB's final predicate list.
	for i, db := range ld.dbs {
		ld.schema.write(db, preds[i])
	}
}

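// cleanup closes the output DBs and prints the final progress summary.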
func (ld *loader) cleanup() {
	for _, db := range ld.dbs {
		x.Check(db.Close())
	}
	ld.prog.endSummary()
}