github.com/unigraph-dev/dgraph@v1.1.1-0.20200923154953-8b52b426f765/dgraph/cmd/bulk/loader.go (about) 1 /* 2 * Copyright 2017-2018 Dgraph Labs, Inc. and Contributors 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package bulk 18 19 import ( 20 "bytes" 21 "compress/gzip" 22 "context" 23 "fmt" 24 "hash/adler32" 25 "io" 26 "io/ioutil" 27 "os" 28 "path/filepath" 29 "sync" 30 "time" 31 32 "github.com/dgraph-io/badger" 33 "github.com/dgraph-io/badger/y" 34 35 "github.com/dgraph-io/dgraph/chunker" 36 "github.com/dgraph-io/dgraph/protos/pb" 37 "github.com/dgraph-io/dgraph/schema" 38 "github.com/dgraph-io/dgraph/x" 39 "github.com/dgraph-io/dgraph/xidmap" 40 41 "google.golang.org/grpc" 42 ) 43 44 type options struct { 45 DataFiles string 46 DataFormat string 47 SchemaFile string 48 OutDir string 49 ReplaceOutDir bool 50 TmpDir string 51 NumGoroutines int 52 MapBufSize uint64 53 SkipMapPhase bool 54 CleanupTmp bool 55 NumReducers int 56 Version bool 57 StoreXids bool 58 ZeroAddr string 59 HttpAddr string 60 IgnoreErrors bool 61 CustomTokenizers string 62 NewUids bool 63 64 MapShards int 65 ReduceShards int 66 67 shardOutputDirs []string 68 } 69 70 type state struct { 71 opt options 72 prog *progress 73 xids *xidmap.XidMap 74 schema *schemaStore 75 shards *shardMap 76 readerChunkCh chan *bytes.Buffer 77 mapFileId uint32 // Used atomically to name the output files of the mappers. 78 dbs []*badger.DB 79 writeTs uint64 // All badger writes use this timestamp 80 } 81 82 type loader struct { 83 *state 84 mappers []*mapper 85 zero *grpc.ClientConn 86 } 87 88 func newLoader(opt options) *loader { 89 fmt.Printf("Connecting to zero at %s\n", opt.ZeroAddr) 90 91 ctx, cancel := context.WithTimeout(context.Background(), time.Minute) 92 defer cancel() 93 94 zero, err := grpc.DialContext(ctx, opt.ZeroAddr, 95 grpc.WithBlock(), 96 grpc.WithInsecure()) 97 x.Checkf(err, "Unable to connect to zero, Is it running at %s?", opt.ZeroAddr) 98 st := &state{ 99 opt: opt, 100 prog: newProgress(), 101 shards: newShardMap(opt.MapShards), 102 // Lots of gz readers, so not much channel buffer needed. 103 readerChunkCh: make(chan *bytes.Buffer, opt.NumGoroutines), 104 writeTs: getWriteTimestamp(zero), 105 } 106 st.schema = newSchemaStore(readSchema(opt.SchemaFile), opt, st) 107 ld := &loader{ 108 state: st, 109 mappers: make([]*mapper, opt.NumGoroutines), 110 zero: zero, 111 } 112 for i := 0; i < opt.NumGoroutines; i++ { 113 ld.mappers[i] = newMapper(st) 114 } 115 go ld.prog.report() 116 return ld 117 } 118 119 func getWriteTimestamp(zero *grpc.ClientConn) uint64 { 120 client := pb.NewZeroClient(zero) 121 for { 122 ctx, cancel := context.WithTimeout(context.Background(), time.Second) 123 ts, err := client.Timestamps(ctx, &pb.Num{Val: 1}) 124 cancel() 125 if err == nil { 126 return ts.GetStartId() 127 } 128 fmt.Printf("Error communicating with dgraph zero, retrying: %v", err) 129 time.Sleep(time.Second) 130 } 131 } 132 133 func readSchema(filename string) *schema.ParsedSchema { 134 f, err := os.Open(filename) 135 x.Check(err) 136 defer f.Close() 137 var r io.Reader = f 138 if filepath.Ext(filename) == ".gz" { 139 r, err = gzip.NewReader(f) 140 x.Check(err) 141 } 142 143 buf, err := ioutil.ReadAll(r) 144 x.Check(err) 145 146 result, err := schema.Parse(string(buf)) 147 x.Check(err) 148 return result 149 } 150 151 func (ld *loader) mapStage() { 152 ld.prog.setPhase(mapPhase) 153 ld.xids = xidmap.New(ld.zero, nil) 154 155 files := x.FindDataFiles(ld.opt.DataFiles, []string{".rdf", ".rdf.gz", ".json", ".json.gz"}) 156 if len(files) == 0 { 157 fmt.Printf("No data files found in %s.\n", ld.opt.DataFiles) 158 os.Exit(1) 159 } 160 161 // Because mappers must handle chunks that may be from different input files, they must all 162 // assume the same data format, either RDF or JSON. Use the one specified by the user or by 163 // the first load file. 164 loadType := chunker.DataFormat(files[0], ld.opt.DataFormat) 165 if loadType == chunker.UnknownFormat { 166 // Dont't try to detect JSON input in bulk loader. 167 fmt.Printf("Need --format=rdf or --format=json to load %s", files[0]) 168 os.Exit(1) 169 } 170 171 var mapperWg sync.WaitGroup 172 mapperWg.Add(len(ld.mappers)) 173 for _, m := range ld.mappers { 174 go func(m *mapper) { 175 m.run(loadType) 176 mapperWg.Done() 177 }(m) 178 } 179 180 // This is the main map loop. 181 thr := y.NewThrottle(ld.opt.NumGoroutines) 182 for i, file := range files { 183 x.Check(thr.Do()) 184 fmt.Printf("Processing file (%d out of %d): %s\n", i+1, len(files), file) 185 186 go func(file string) { 187 defer thr.Done(nil) 188 189 r, cleanup := chunker.FileReader(file) 190 defer cleanup() 191 192 chunker := chunker.NewChunker(loadType, 1000) 193 for { 194 chunkBuf, err := chunker.Chunk(r) 195 if chunkBuf != nil && chunkBuf.Len() > 0 { 196 ld.readerChunkCh <- chunkBuf 197 } 198 if err == io.EOF { 199 break 200 } else if err != nil { 201 x.Check(err) 202 } 203 } 204 }(file) 205 } 206 x.Check(thr.Finish()) 207 208 close(ld.readerChunkCh) 209 mapperWg.Wait() 210 211 // Allow memory to GC before the reduce phase. 212 for i := range ld.mappers { 213 ld.mappers[i] = nil 214 } 215 x.Check(ld.xids.Flush()) 216 ld.xids = nil 217 } 218 219 func (ld *loader) reduceStage() { 220 ld.prog.setPhase(reducePhase) 221 222 r := reducer{state: ld.state} 223 x.Check(r.run()) 224 } 225 226 func (ld *loader) writeSchema() { 227 numDBs := uint32(len(ld.dbs)) 228 preds := make([][]string, numDBs) 229 230 // Get all predicates that have data in some DB. 231 m := make(map[string]struct{}) 232 for i, db := range ld.dbs { 233 preds[i] = ld.schema.getPredicates(db) 234 for _, p := range preds[i] { 235 m[p] = struct{}{} 236 } 237 } 238 239 // Find any predicates that don't have data in any DB 240 // and distribute them among all the DBs. 241 for p := range ld.schema.schemaMap { 242 if _, ok := m[p]; !ok { 243 i := adler32.Checksum([]byte(p)) % numDBs 244 preds[i] = append(preds[i], p) 245 } 246 } 247 248 // Write out each DB's final predicate list. 249 for i, db := range ld.dbs { 250 ld.schema.write(db, preds[i]) 251 } 252 } 253 254 func (ld *loader) cleanup() { 255 for _, db := range ld.dbs { 256 x.Check(db.Close()) 257 } 258 ld.prog.endSummary() 259 }