github.com/dshekhar95/sub_dgraph@v0.0.0-20230424164411-6be28e40bbf1/dgraph/cmd/bulk/mapper.go

/*
 * Copyright 2017-2022 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package bulk

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"log"
	"math"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"sync"
	"sync/atomic"

	farm "github.com/dgryski/go-farm"
	"github.com/golang/glog"
	"github.com/golang/snappy"

	"github.com/dgraph-io/dgo/v210/protos/api"
	"github.com/dgraph-io/dgraph/chunker"
	"github.com/dgraph-io/dgraph/dql"
	"github.com/dgraph-io/dgraph/ee/acl"
	"github.com/dgraph-io/dgraph/posting"
	"github.com/dgraph-io/dgraph/protos/pb"
	"github.com/dgraph-io/dgraph/tok"
	"github.com/dgraph-io/dgraph/types"
	"github.com/dgraph-io/dgraph/types/facets"
	"github.com/dgraph-io/dgraph/x"
	"github.com/dgraph-io/ristretto/z"
)

var (
	aclOnce sync.Once
)

type mapper struct {
	*state
	shards []shardState // shard is based on predicate
}

type shardState struct {
	// Buffer up map entries until we have a sufficient amount, then sort and
	// write them to file.
	cbuf *z.Buffer
	mu   sync.Mutex // Allow only 1 write per shard at a time.
}

func newMapperBuffer(opt *options) *z.Buffer {
	sz := float64(opt.MapBufSize) * 1.1
	tmpDir := filepath.Join(opt.TmpDir, bufferDir)
	buf, err := z.NewBufferTmp(tmpDir, int(sz))
	x.Check(err)
	return buf.WithMaxSize(2 * int(opt.MapBufSize))
}

func newMapper(st *state) *mapper {
	shards := make([]shardState, st.opt.MapShards)
	for i := range shards {
		shards[i].cbuf = newMapperBuffer(st.opt)
	}
	return &mapper{
		state:  st,
		shards: shards,
	}
}

type MapEntry []byte

// type mapEntry struct {
// 	uid   uint64 // if plist is filled, then corresponds to plist's uid.
// 	key   []byte
// 	plist []byte
// }
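
// A MapEntry produced by marshalMapEntry is laid out as:
//
// 	[0:8)            uid (big-endian uint64)
// 	[8:12)           key length (big-endian uint32)
// 	[12:16)          posting length (big-endian uint32)
// 	[16:16+keyLen)   key bytes
// 	[16+keyLen:end)  marshalled pb.Posting, if any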

func mapEntrySize(key []byte, p *pb.Posting) int {
	return 8 + 4 + 4 + len(key) + p.Size() // UID + keySz + postingSz + len(key) + size(p)
}

func marshalMapEntry(dst []byte, uid uint64, key []byte, p *pb.Posting) {
	if p != nil {
		uid = p.Uid
	}
	binary.BigEndian.PutUint64(dst[0:8], uid)
	binary.BigEndian.PutUint32(dst[8:12], uint32(len(key)))

	psz := p.Size()
	binary.BigEndian.PutUint32(dst[12:16], uint32(psz))

	n := copy(dst[16:], key)

	if psz > 0 {
		pbuf := dst[16+n:]
		_, err := p.MarshalToSizedBuffer(pbuf[:psz])
		x.Check(err)
	}

	x.AssertTrue(len(dst) == 16+n+psz)
}

func (me MapEntry) Size() int {
	return len(me)
}

func (me MapEntry) Uid() uint64 {
	return binary.BigEndian.Uint64(me[0:8])
}

func (me MapEntry) Key() []byte {
	sz := binary.BigEndian.Uint32(me[8:12])
	return me[16 : 16+sz]
}

func (me MapEntry) Plist() []byte {
	ksz := binary.BigEndian.Uint32(me[8:12])
	sz := binary.BigEndian.Uint32(me[12:16])
	start := 16 + ksz
	return me[start : start+sz]
}

func less(lhs, rhs MapEntry) bool {
	if keyCmp := bytes.Compare(lhs.Key(), rhs.Key()); keyCmp != 0 {
		return keyCmp < 0
	}
	return lhs.Uid() < rhs.Uid()
}

func (m *mapper) openOutputFile(shardIdx int) (*os.File, error) {
	fileNum := atomic.AddUint32(&m.mapFileId, 1)
	filename := filepath.Join(
		m.opt.TmpDir,
		mapShardDir,
		fmt.Sprintf("%03d", shardIdx),
		fmt.Sprintf("%06d.map.gz", fileNum),
	)
	x.Check(os.MkdirAll(filepath.Dir(filename), 0750))
	return os.OpenFile(filename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0600)
}

func (m *mapper) writeMapEntriesToFile(cbuf *z.Buffer, shardIdx int) {
	defer func() {
		m.shards[shardIdx].mu.Unlock() // Locked by caller.
		if err := cbuf.Release(); err != nil {
			glog.Warningf("error in releasing buffer: %v", err)
		}
	}()

	cbuf.SortSlice(func(ls, rs []byte) bool {
		lhs := MapEntry(ls)
		rhs := MapEntry(rs)
		return less(lhs, rhs)
	})

	f, err := m.openOutputFile(shardIdx)
	x.Check(err)

	defer func() {
		x.Check(f.Sync())
		x.Check(f.Close())
	}()

	w := snappy.NewBufferedWriter(f)
	defer func() {
		x.Check(w.Close())
	}()

	// Create partition keys for the map file.
	header := &pb.MapHeader{
		PartitionKeys: [][]byte{},
	}

	var bufSize int64
	if err := cbuf.SliceIterate(func(slice []byte) error {
		me := MapEntry(slice)
		bufSize += int64(4 + len(me))
		if bufSize < m.opt.PartitionBufSize {
			return nil
		}
		sz := len(header.PartitionKeys)
		if sz > 0 && bytes.Equal(me.Key(), header.PartitionKeys[sz-1]) {
			// We already have this key.
			return nil
		}
		header.PartitionKeys = append(header.PartitionKeys, me.Key())
		bufSize = 0
		return nil
	}); err != nil {
		glog.Errorf("error while iterating over buf: %v", err)
		x.Check(err)
	}
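
	// On disk, a map file is one snappy-compressed stream: a 4-byte
	// big-endian header length, the marshalled pb.MapHeader carrying the
	// partition keys, and then one uvarint length-prefixed MapEntry per
	// record, in sorted order.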
	// Write the header to the map file.
	headerBuf, err := header.Marshal()
	x.Check(err)
	lenBuf := make([]byte, 4)
	binary.BigEndian.PutUint32(lenBuf, uint32(len(headerBuf)))
	x.Check2(w.Write(lenBuf))
	x.Check2(w.Write(headerBuf))

	sizeBuf := make([]byte, binary.MaxVarintLen64)

	err = cbuf.SliceIterate(func(slice []byte) error {
		n := binary.PutUvarint(sizeBuf, uint64(len(slice)))
		_, err := w.Write(sizeBuf[:n])
		x.Check(err)

		_, err = w.Write(slice)
		return err
	})
	x.Check(err)
}

func (m *mapper) run(inputFormat chunker.InputFormat) {
	chunk := chunker.NewChunker(inputFormat, 1000)
	nquads := chunk.NQuads()
	go func() {
		for chunkBuf := range m.readerChunkCh {
			if err := chunk.Parse(chunkBuf); err != nil {
				atomic.AddInt64(&m.prog.errCount, 1)
				if !m.opt.IgnoreErrors {
					x.Check(err)
				}
			}
		}
		aclOnce.Do(func() {
			if m.opt.Namespace != math.MaxUint64 && m.opt.Namespace != x.GalaxyNamespace {
				// Insert ACL-related RDFs when force-loading the data into a
				// non-galaxy namespace.
				aclNquads := make([]*api.NQuad, 0)
				aclNquads = append(aclNquads, acl.CreateGroupNQuads(x.GuardiansId)...)
				aclNquads = append(aclNquads, acl.CreateUserNQuads(x.GrootId, "password")...)
				aclNquads = append(aclNquads, &api.NQuad{
					Subject:   "_:newuser",
					Predicate: "dgraph.user.group",
					ObjectId:  "_:newgroup",
				})
				nquads.Push(aclNquads...)
			}
		})
		nquads.Flush()
	}()

	for nqs := range nquads.Ch() {
		for _, nq := range nqs {
			if err := facets.SortAndValidate(nq.Facets); err != nil {
				atomic.AddInt64(&m.prog.errCount, 1)
				if !m.opt.IgnoreErrors {
					x.Check(err)
				}
			}

			m.processNQuad(dql.NQuad{NQuad: nq})
			atomic.AddInt64(&m.prog.nquadCount, 1)
		}

		for i := range m.shards {
			sh := &m.shards[i]
			if uint64(sh.cbuf.LenNoPadding()) >= m.opt.MapBufSize {
				sh.mu.Lock() // One write at a time.
				go m.writeMapEntriesToFile(sh.cbuf, i)
				// Hand the full buffer off to the writer goroutine and start
				// the next batch with a fresh buffer.
				sh.cbuf = newMapperBuffer(m.opt)
			}
		}
	}

	for i := range m.shards {
		sh := &m.shards[i]
		if sh.cbuf.LenNoPadding() > 0 {
			sh.mu.Lock() // One write at a time.
			m.writeMapEntriesToFile(sh.cbuf, i)
		} else {
			if err := sh.cbuf.Release(); err != nil {
				glog.Warningf("error in releasing buffer: %v", err)
			}
		}
		m.shards[i].mu.Lock() // Ensure that the last file write finishes.
	}
}

func (m *mapper) addMapEntry(key []byte, p *pb.Posting, shard int) {
	atomic.AddInt64(&m.prog.mapEdgeCount, 1)

	uid := p.Uid
	if p.PostingType == pb.Posting_REF && len(p.Facets) == 0 {
		// The posting carries nothing beyond the UID, so store only the UID.
		p = nil
	}

	sh := &m.shards[shard]

	sz := mapEntrySize(key, p)
	dst := sh.cbuf.SliceAllocate(sz)
	marshalMapEntry(dst, uid, key, p)
}
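
// processNQuad converts a single N-Quad into map entries: it resolves the
// subject (and the object, for UID edges) to UIDs, builds the forward and
// any reverse posting, and emits the data, reverse, and index keys into the
// shard that owns the predicate.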
func (m *mapper) processNQuad(nq dql.NQuad) {
	if m.opt.Namespace != math.MaxUint64 {
		// Use the namespace passed through the '--force-namespace' flag.
		nq.Namespace = m.opt.Namespace
	}
	sid := m.uid(nq.GetSubject(), nq.Namespace)
	if sid == 0 {
		panic(fmt.Sprintf("invalid UID with value 0 for %v", nq.GetSubject()))
	}
	var oid uint64
	var de *pb.DirectedEdge
	if nq.GetObjectValue() == nil {
		oid = m.uid(nq.GetObjectId(), nq.Namespace)
		if oid == 0 {
			panic(fmt.Sprintf("invalid UID with value 0 for %v", nq.GetObjectId()))
		}
		de = nq.CreateUidEdge(sid, oid)
	} else {
		var err error
		de, err = nq.CreateValueEdge(sid)
		x.Check(err)
	}

	m.schema.checkAndSetInitialSchema(nq.Namespace)

	// Appropriate schema must exist for the nquad's namespace by this time.
	de.Attr = x.NamespaceAttr(de.Namespace, de.Attr)
	fwd, rev := m.createPostings(nq, de)
	shard := m.state.shards.shardFor(de.Attr)
	key := x.DataKey(de.Attr, sid)
	m.addMapEntry(key, fwd, shard)

	if rev != nil {
		key = x.ReverseKey(de.Attr, oid)
		m.addMapEntry(key, rev, shard)
	}
	m.addIndexMapEntries(nq, de)
}

func (m *mapper) uid(xid string, ns uint64) uint64 {
	if !m.opt.NewUids {
		if uid, err := strconv.ParseUint(xid, 0, 64); err == nil {
			m.xids.BumpTo(uid)
			return uid
		}
	}

	return m.lookupUid(xid, ns)
}

func (m *mapper) lookupUid(xid string, ns uint64) uint64 {
	// We used to create a copy of the xid string here, because it is stored
	// in the map inside AssignUid and stays around for the whole process.
	// xid is a substring of the line read from the file, so keeping xid
	// alive keeps the whole line alive and prevents it from being GC'd.
	// We also verified that the strings.Builder stays on the stack while
	// sb.String() escapes to the heap, and that the calls to the
	// strings.Builder methods are inlined.
	//
	// With Trie, we no longer need strings.Builder, because the Trie uses
	// its own storage for the strings.
	// sb := strings.Builder{}
	// x.Check2(sb.WriteString(xid))
	// uid, isNew := m.xids.AssignUid(sb.String())

	// NQuads from different namespaces may share the same xid, so namespace
	// the xid before assigning a UID.
	uid, isNew := m.xids.AssignUid(x.NamespaceAttr(ns, xid))
	if !m.opt.StoreXids || !isNew {
		return uid
	}
	if strings.HasPrefix(xid, "_:") {
		// Don't store xids for blank nodes.
		return uid
	}
	nq := dql.NQuad{NQuad: &api.NQuad{
		Subject:   xid,
		Predicate: "xid",
		ObjectValue: &api.Value{
			Val: &api.Value_StrVal{StrVal: xid},
		},
		Namespace: ns,
	}}
	m.processNQuad(nq)
	return uid
}

func (m *mapper) createPostings(nq dql.NQuad,
	de *pb.DirectedEdge) (*pb.Posting, *pb.Posting) {

	m.schema.validateType(de, nq.ObjectValue == nil)

	p := posting.NewPosting(de)
	sch := m.schema.getSchema(x.NamespaceAttr(nq.GetNamespace(), nq.GetPredicate()))
	if nq.GetObjectValue() != nil {
		lang := de.GetLang()
		switch {
		case len(lang) > 0:
			p.Uid = farm.Fingerprint64([]byte(lang))
		case sch.List:
			p.Uid = farm.Fingerprint64(de.Value)
		default:
			p.Uid = math.MaxUint64
		}
	}
	p.Facets = nq.Facets

	// Early exit if there is no reverse edge.
	if sch.GetDirective() != pb.SchemaUpdate_REVERSE {
		return p, nil
	}

	// Reverse predicate.
	x.AssertTruef(nq.GetObjectValue() == nil, "only has reverse schema if object is UID")
	de.Entity, de.ValueId = de.ValueId, de.Entity
	m.schema.validateType(de, true)
	rp := posting.NewPosting(de)

	de.Entity, de.ValueId = de.ValueId, de.Entity // de is reused, so swap back.

	return p, rp
}
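
// addIndexMapEntries tokenizes a value edge with every tokenizer configured
// on the predicate's schema and emits one index map entry per token. UID
// edges are not indexed.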
func (m *mapper) addIndexMapEntries(nq dql.NQuad, de *pb.DirectedEdge) {
	if nq.GetObjectValue() == nil {
		return // Cannot index UIDs
	}

	sch := m.schema.getSchema(x.NamespaceAttr(nq.GetNamespace(), nq.GetPredicate()))
	for _, tokerName := range sch.GetTokenizer() {
		// Find tokenizer.
		toker, ok := tok.GetTokenizer(tokerName)
		if !ok {
			log.Fatalf("unknown tokenizer %q", tokerName)
		}

		// Create storage value.
		storageVal := types.Val{
			Tid:   types.TypeID(de.GetValueType()),
			Value: de.GetValue(),
		}

		// Convert from storage type to schema type.
		schemaVal, err := types.Convert(storageVal, types.TypeID(sch.GetValueType()))
		// Shouldn't error, since we've already checked for convertibility when
		// doing edge postings. So okay to be fatal.
		x.Check(err)

		// Extract tokens.
		toks, err := tok.BuildTokens(schemaVal.Value, tok.GetTokenizerForLang(toker, nq.Lang))
		x.Check(err)

		attr := x.NamespaceAttr(nq.Namespace, nq.Predicate)
		// Store index posting.
		for _, t := range toks {
			m.addMapEntry(
				x.IndexKey(attr, t),
				&pb.Posting{
					Uid:         de.GetEntity(),
					PostingType: pb.Posting_REF,
				},
				m.state.shards.shardFor(attr),
			)
		}
	}
}
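
// The sketch below is illustrative only and is not part of the original
// file. It shows how a map file written by writeMapEntriesToFile could be
// read back, assuming "bufio" and "io" were added to the imports: the
// snappy stream holds a 4-byte big-endian header length, the marshalled
// pb.MapHeader, then uvarint length-prefixed MapEntry records. The reduce
// phase does the real decoding; this is only a format demonstration.
//
// func readMapFile(path string, visit func(me MapEntry)) error {
// 	f, err := os.Open(path)
// 	if err != nil {
// 		return err
// 	}
// 	defer f.Close()
//
// 	r := bufio.NewReader(snappy.NewReader(f))
//
// 	// Header: 4-byte big-endian length, then the pb.MapHeader bytes.
// 	lenBuf := make([]byte, 4)
// 	if _, err := io.ReadFull(r, lenBuf); err != nil {
// 		return err
// 	}
// 	headerBuf := make([]byte, binary.BigEndian.Uint32(lenBuf))
// 	if _, err := io.ReadFull(r, headerBuf); err != nil {
// 		return err
// 	}
// 	header := &pb.MapHeader{}
// 	if err := header.Unmarshal(headerBuf); err != nil {
// 		return err
// 	}
//
// 	// Entries: uvarint size followed by that many MapEntry bytes.
// 	for {
// 		sz, err := binary.ReadUvarint(r)
// 		if err == io.EOF {
// 			return nil
// 		}
// 		if err != nil {
// 			return err
// 		}
// 		buf := make([]byte, sz)
// 		if _, err := io.ReadFull(r, buf); err != nil {
// 			return err
// 		}
// 		visit(MapEntry(buf))
// 	}
// }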