github.com/unigraph-dev/dgraph@v1.1.1-0.20200923154953-8b52b426f765/dgraph/cmd/live/run.go

/*
 * Copyright 2017-2018 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package live

import (
	"bufio"
	"compress/gzip"
	"context"
	"crypto/tls"
	"fmt"
	"io"
	"io/ioutil"
	"math"
	"math/rand"
	"net/http"
	_ "net/http/pprof" // http profiler
	"os"
	"strconv"
	"strings"
	"sync"
	"time"

	"google.golang.org/grpc/metadata"

	"github.com/dgraph-io/badger"
	bopt "github.com/dgraph-io/badger/options"
	"github.com/dgraph-io/dgo"
	"github.com/dgraph-io/dgo/protos/api"

	"github.com/dgraph-io/dgraph/chunker"
	"github.com/dgraph-io/dgraph/x"
	"github.com/dgraph-io/dgraph/xidmap"

	"github.com/golang/glog"
	"github.com/pkg/errors"
	"github.com/spf13/cobra"
)

type options struct {
	dataFiles      string
	dataFormat     string
	schemaFile     string
	zero           string
	concurrent     int
	batchSize      int
	clientDir      string
	authToken      string
	useCompression bool
	newUids        bool
	verbose        bool
}

var (
	opt    options
	tlsCfg *tls.Config
	// Live is the sub-command invoked when running "dgraph live".
	Live x.SubCommand
)

func init() {
	Live.Cmd = &cobra.Command{
		Use:   "live",
		Short: "Run Dgraph live loader",
		Run: func(cmd *cobra.Command, args []string) {
			defer x.StartProfile(Live.Conf).Stop()
			if err := run(); err != nil {
				os.Exit(1)
			}
		},
	}
	Live.EnvPrefix = "DGRAPH_LIVE"

	flag := Live.Cmd.Flags()
	flag.StringP("files", "f", "", "Location of *.rdf(.gz) or *.json(.gz) file(s) to load")
	flag.StringP("schema", "s", "", "Location of schema file")
	flag.String("format", "", "Specify file format (rdf or json) instead of getting it from filename")
	flag.StringP("alpha", "a", "127.0.0.1:9080",
		"Comma-separated list of Dgraph alpha gRPC server addresses")
	flag.StringP("zero", "z", "127.0.0.1:5080", "Dgraph zero gRPC server address")
	flag.IntP("conc", "c", 10,
		"Number of concurrent requests to make to Dgraph")
	flag.IntP("batch", "b", 1000,
		"Number of N-Quads to send as part of a mutation.")
	flag.StringP("xidmap", "x", "", "Directory to store xid to uid mapping")
	flag.StringP("auth_token", "t", "",
		"The auth token passed to the server for Alter operation of the schema file")
	flag.BoolP("use_compression", "C", false,
		"Enable compression on connection to alpha server")
	flag.Bool("new_uids", false,
		"Ignore UIDs in load files and assign new ones.")
	flag.Bool("verbose", false, "Run the live loader in verbose mode")
	flag.StringP("user", "u", "", "Username if login is required.")
	flag.StringP("password", "p", "", "Password of the user.")

	// TLS configuration
	x.RegisterClientTLSFlags(flag)
}

// processSchemaFile processes the schema in the given file (plain or gzipped).
func processSchemaFile(ctx context.Context, file string, dgraphClient *dgo.Dgraph) error {
	fmt.Printf("\nProcessing schema file %q\n", file)
	if len(opt.authToken) > 0 {
		md := metadata.New(nil)
		md.Append("auth-token", opt.authToken)
		ctx = metadata.NewOutgoingContext(ctx, md)
	}

	f, err := os.Open(file)
	x.CheckfNoTrace(err)
	defer f.Close()

	var reader io.Reader
	if strings.HasSuffix(strings.ToLower(file), ".gz") {
		reader, err = gzip.NewReader(f)
		x.Check(err)
	} else {
		reader = f
	}

	b, err := ioutil.ReadAll(reader)
	if err != nil {
		x.Checkf(err, "Error while reading file")
	}

	op := &api.Operation{}
	op.Schema = string(b)
	return dgraphClient.Alter(ctx, op)
}

func (l *loader) uid(val string) string {
	// Attempt to parse as a UID (in the same format that dgraph outputs - a
	// hex number prefixed by "0x"). If parsing succeeds, then this is assumed
	// to be an existing node in the graph. There is limited protection against
	// a user selecting an unassigned UID in this way - it may be assigned
	// later to another node. It is up to the user to avoid this.
	if !opt.newUids {
		if uid, err := strconv.ParseUint(val, 0, 64); err == nil {
			l.alloc.BumpTo(uid)
			return fmt.Sprintf("%#x", uid)
		}
	}

	uid := l.alloc.AssignUid(val)
	return fmt.Sprintf("%#x", uint64(uid))
}

// processFile forwards a file to the RDF or JSON processor as appropriate.
func (l *loader) processFile(ctx context.Context, filename string) error {
	fmt.Printf("Processing data file %q\n", filename)

	rd, cleanup := chunker.FileReader(filename)
	defer cleanup()

	loadType := chunker.DataFormat(filename, opt.dataFormat)
	if loadType == chunker.UnknownFormat {
		if isJson, err := chunker.IsJSONData(rd); err == nil {
			if isJson {
				loadType = chunker.JsonFormat
			} else {
				return errors.Errorf("need --format=rdf or --format=json to load %s", filename)
			}
		}
	}

	return l.processLoadFile(ctx, rd, chunker.NewChunker(loadType, opt.batchSize))
}

func (l *loader) processLoadFile(ctx context.Context, rd *bufio.Reader, ck chunker.Chunker) error {
	var wg sync.WaitGroup
	wg.Add(1)
	nqbuf := ck.NQuads()
	// Spin a goroutine to push NQuads to the mutation channel.
	go func() {
		defer wg.Done()
		for nqs := range nqbuf.Ch() {
			if len(nqs) == 0 {
				continue
			}
			for _, nq := range nqs {
				nq.Subject = l.uid(nq.Subject)
				if len(nq.ObjectId) > 0 {
					nq.ObjectId = l.uid(nq.ObjectId)
				}
			}

			mu := api.Mutation{Set: nqs}
			l.reqs <- mu
		}
	}()

	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
		}

		chunkBuf, err := ck.Chunk(rd)
		// Parses the RDF entries from the chunk, groups them into batches (each one
		// containing opt.batchSize entries) and sends the batches to the loader.reqs
		// channel (see above).
		if oerr := ck.Parse(chunkBuf); oerr != nil {
			return errors.Wrap(oerr, "During parsing chunk in processLoadFile")
		}
		if err == io.EOF {
			break
		} else {
			x.Check(err)
		}
	}
	nqbuf.Flush()
	wg.Wait()

	return nil
}

func setup(opts batchMutationOptions, dc *dgo.Dgraph) *loader {
	var db *badger.DB
	if len(opt.clientDir) > 0 {
		x.Check(os.MkdirAll(opt.clientDir, 0700))

		var err error
		db, err = badger.Open(badger.DefaultOptions(opt.clientDir).
			WithTableLoadingMode(bopt.MemoryMap).
			WithSyncWrites(false))
		x.Checkf(err, "Error while creating badger KV posting store")
	}

	// Compression with the zero server actually makes things worse.
	connzero, err := x.SetupConnection(opt.zero, tlsCfg, false)
	x.Checkf(err, "Unable to connect to zero. Is it running at %s?", opt.zero)

	alloc := xidmap.New(connzero, db)
	l := &loader{
		opts:     opts,
		dc:       dc,
		start:    time.Now(),
		reqs:     make(chan api.Mutation, opts.Pending*2),
		alloc:    alloc,
		db:       db,
		zeroconn: connzero,
	}

	l.requestsWg.Add(opts.Pending)
	for i := 0; i < opts.Pending; i++ {
		go l.makeRequests()
	}

	rand.Seed(time.Now().Unix())
	return l
}

func run() error {
	x.PrintVersion()
	opt = options{
		dataFiles:      Live.Conf.GetString("files"),
		dataFormat:     Live.Conf.GetString("format"),
		schemaFile:     Live.Conf.GetString("schema"),
		zero:           Live.Conf.GetString("zero"),
		concurrent:     Live.Conf.GetInt("conc"),
		batchSize:      Live.Conf.GetInt("batch"),
		clientDir:      Live.Conf.GetString("xidmap"),
		authToken:      Live.Conf.GetString("auth_token"),
		useCompression: Live.Conf.GetBool("use_compression"),
		newUids:        Live.Conf.GetBool("new_uids"),
		verbose:        Live.Conf.GetBool("verbose"),
	}
	go func() {
		if err := http.ListenAndServe("localhost:6060", nil); err != nil {
			glog.Errorf("Error while starting HTTP server on port 6060: %+v", err)
		}
	}()
	ctx := context.Background()
	bmOpts := batchMutationOptions{
		Size:          opt.batchSize,
		Pending:       opt.concurrent,
		PrintCounters: true,
		Ctx:           ctx,
		MaxRetries:    math.MaxUint32,
	}

	dg, closeFunc := x.GetDgraphClient(Live.Conf, true)
	defer closeFunc()

	l := setup(bmOpts, dg)
	defer l.zeroconn.Close()

	if len(opt.schemaFile) > 0 {
		if err := processSchemaFile(ctx, opt.schemaFile, dg); err != nil {
			if err == context.Canceled {
				fmt.Printf("Interrupted while processing schema file %q\n", opt.schemaFile)
				return nil
			}
			fmt.Printf("Error while processing schema file %q: %s\n", opt.schemaFile, err)
			return err
		}
		fmt.Printf("Processed schema file %q\n\n", opt.schemaFile)
	}

	if opt.dataFiles == "" {
		return errors.New("RDF or JSON file(s) location must be specified")
	}

	filesList := x.FindDataFiles(opt.dataFiles, []string{".rdf", ".rdf.gz", ".json", ".json.gz"})
	totalFiles := len(filesList)
	if totalFiles == 0 {
		return errors.Errorf("No data files found in %s", opt.dataFiles)
	}
	fmt.Printf("Found %d data file(s) to process\n", totalFiles)

	// x.Check(dgraphClient.NewSyncMarks(filesList))
	errCh := make(chan error, totalFiles)
	for _, file := range filesList {
		file = strings.Trim(file, " \t")
		go func(file string) {
			errCh <- l.processFile(ctx, file)
		}(file)
	}

	// PrintCounters should be called after the schema has been updated.
	if bmOpts.PrintCounters {
		go l.printCounters()
	}

	for i := 0; i < totalFiles; i++ {
		if err := <-errCh; err != nil {
			fmt.Printf("Error while processing data file %q: %s\n", filesList[i], err)
			return err
		}
	}

	close(l.reqs)
	// First we wait for requestsWg; once it is done we know all retry requests have been
	// added to retryRequestsWg. We can't use a single waitgroup because, by the time we
	// call Wait, we can't be sure that all retry requests have already been added to it.
	l.requestsWg.Wait()
	l.retryRequestsWg.Wait()
	c := l.Counter()
	var rate uint64
	if c.Elapsed.Seconds() < 1 {
		rate = c.Nquads
	} else {
		rate = c.Nquads / uint64(c.Elapsed.Seconds())
	}
	// Let's print an empty line, otherwise "Interrupted" or "Number of Mutations" overwrites
	// the previously printed line.
	fmt.Printf("%100s\r", "")
	fmt.Printf("Number of TXs run            : %d\n", c.TxnsDone)
	fmt.Printf("Number of N-Quads processed  : %d\n", c.Nquads)
	fmt.Printf("Time spent                   : %v\n", c.Elapsed)
	fmt.Printf("N-Quads processed per second : %d\n", rate)

	if l.db != nil {
		l.alloc.Flush()
		l.db.Close()
	}
	return nil
}
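
// Example invocation of the live loader, assembled from the flags registered in init()
// above. The data and schema file names are placeholders; the address, concurrency, and
// batch values shown are this file's defaults.
//
//	dgraph live --files data.rdf.gz --schema schema.txt \
//		--alpha 127.0.0.1:9080 --zero 127.0.0.1:5080 --conc 10 --batch 1000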