github.com/dshekhar95/sub_dgraph@v0.0.0-20230424164411-6be28e40bbf1/dgraph/cmd/bulk/mapper.go

/*
 * Copyright 2017-2022 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package bulk

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"log"
	"math"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"sync"
	"sync/atomic"

	farm "github.com/dgryski/go-farm"
	"github.com/golang/glog"
	"github.com/golang/snappy"

	"github.com/dgraph-io/dgo/v210/protos/api"
	"github.com/dgraph-io/dgraph/chunker"
	"github.com/dgraph-io/dgraph/dql"
	"github.com/dgraph-io/dgraph/ee/acl"
	"github.com/dgraph-io/dgraph/posting"
	"github.com/dgraph-io/dgraph/protos/pb"
	"github.com/dgraph-io/dgraph/tok"
	"github.com/dgraph-io/dgraph/types"
	"github.com/dgraph-io/dgraph/types/facets"
	"github.com/dgraph-io/dgraph/x"
	"github.com/dgraph-io/ristretto/z"
)

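// aclOnce ensures that the ACL bootstrap triples in run() are pushed into the
// N-Quad stream at most once, even with multiple mapper goroutines running.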
var aclOnce sync.Once

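// mapper runs the map phase of the bulk loader: it parses chunks from the
// reader, converts each N-Quad into map entries, and buffers those entries
// per shard until they are sorted and flushed to map files.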
type mapper struct {
	*state
	shards []shardState // Sharding is based on predicate.
}

type shardState struct {
	// Buffer up map entries until we have a sufficient amount, then sort and
	// write them to file.
	cbuf *z.Buffer
	mu   sync.Mutex // Allow only 1 write per shard at a time.
}

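// newMapperBuffer allocates the entry buffer for one shard, backed by a
// temporary file under TmpDir/bufferDir. It starts at ~1.1x MapBufSize and is
// capped at 2x MapBufSize.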
func newMapperBuffer(opt *options) *z.Buffer {
	sz := float64(opt.MapBufSize) * 1.1
	tmpDir := filepath.Join(opt.TmpDir, bufferDir)
	buf, err := z.NewBufferTmp(tmpDir, int(sz))
	x.Check(err)
	return buf.WithMaxSize(2 * int(opt.MapBufSize))
}

func newMapper(st *state) *mapper {
	shards := make([]shardState, st.opt.MapShards)
	for i := range shards {
		shards[i].cbuf = newMapperBuffer(st.opt)
	}
	return &mapper{
		state:  st,
		shards: shards,
	}
}

// MapEntry is a single serialized map entry. Its wire layout is:
//
//	uid   uint64 // If plist is filled, it corresponds to plist's uid.
//	key   []byte
//	plist []byte
type MapEntry []byte

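// mapEntrySize returns the number of bytes needed to serialize one entry.
// Note that p may be nil here (a plain UID ref); the generated Size method
// returns 0 for a nil message, so the posting then contributes no bytes.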
func mapEntrySize(key []byte, p *pb.Posting) int {
	return 8 + 4 + 4 + len(key) + p.Size() // UID + keySz + postingSz + len(key) + size(p)
}

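// marshalMapEntry serializes uid, key, and the (possibly nil) posting into
// dst, which must be exactly mapEntrySize(key, p) bytes long: a 16-byte header
// (uid, key length, posting length), then the key, then the marshaled posting.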
func marshalMapEntry(dst []byte, uid uint64, key []byte, p *pb.Posting) {
	if p != nil {
		uid = p.Uid
	}
	binary.BigEndian.PutUint64(dst[0:8], uid)
	binary.BigEndian.PutUint32(dst[8:12], uint32(len(key)))

	psz := p.Size()
	binary.BigEndian.PutUint32(dst[12:16], uint32(psz))

	n := copy(dst[16:], key)

	if psz > 0 {
		pbuf := dst[16+n:]
		_, err := p.MarshalToSizedBuffer(pbuf[:psz])
		x.Check(err)
	}

	x.AssertTrue(len(dst) == 16+n+psz)
}

func (me MapEntry) Size() int {
	return len(me)
}

func (me MapEntry) Uid() uint64 {
	return binary.BigEndian.Uint64(me[0:8])
}

func (me MapEntry) Key() []byte {
	sz := binary.BigEndian.Uint32(me[8:12])
	return me[16 : 16+sz]
}

func (me MapEntry) Plist() []byte {
	ksz := binary.BigEndian.Uint32(me[8:12])
	sz := binary.BigEndian.Uint32(me[12:16])
	start := 16 + ksz
	return me[start : start+sz]
}

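// less orders map entries by key, breaking ties by UID. This is the sort
// order of each map file, which lets the reduce phase merge them cheaply.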
func less(lhs, rhs MapEntry) bool {
	if keyCmp := bytes.Compare(lhs.Key(), rhs.Key()); keyCmp != 0 {
		return keyCmp < 0
	}
	return lhs.Uid() < rhs.Uid()
}

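// openOutputFile creates the next numbered map file for the given shard,
// under TmpDir/mapShardDir/<shard>/<fileNum>.map.gz.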
func (m *mapper) openOutputFile(shardIdx int) (*os.File, error) {
	fileNum := atomic.AddUint32(&m.mapFileId, 1)
	filename := filepath.Join(
		m.opt.TmpDir,
		mapShardDir,
		fmt.Sprintf("%03d", shardIdx),
		fmt.Sprintf("%06d.map.gz", fileNum),
	)
	x.Check(os.MkdirAll(filepath.Dir(filename), 0750))
	return os.OpenFile(filename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0600)
}

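// writeMapEntriesToFile sorts the buffered entries and streams them to a
// snappy-compressed map file: a length-prefixed MapHeader holding partition
// keys (sampled roughly every PartitionBufSize bytes), followed by each entry
// prefixed with its uvarint length. The caller must hold the shard lock.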
func (m *mapper) writeMapEntriesToFile(cbuf *z.Buffer, shardIdx int) {
	defer func() {
		m.shards[shardIdx].mu.Unlock() // Locked by caller.
		if err := cbuf.Release(); err != nil {
			glog.Warningf("error in releasing buffer: %v", err)
		}
	}()

	cbuf.SortSlice(func(ls, rs []byte) bool {
		lhs := MapEntry(ls)
		rhs := MapEntry(rs)
		return less(lhs, rhs)
	})

	f, err := m.openOutputFile(shardIdx)
	x.Check(err)

	defer func() {
		x.Check(f.Sync())
		x.Check(f.Close())
	}()

	w := snappy.NewBufferedWriter(f)
	defer func() {
		x.Check(w.Close())
	}()

	// Create partition keys for the map file.
	header := &pb.MapHeader{
		PartitionKeys: [][]byte{},
	}

	var bufSize int64
	if err := cbuf.SliceIterate(func(slice []byte) error {
		me := MapEntry(slice)
		bufSize += int64(4 + len(me))
		if bufSize < m.opt.PartitionBufSize {
			return nil
		}
		sz := len(header.PartitionKeys)
		if sz > 0 && bytes.Equal(me.Key(), header.PartitionKeys[sz-1]) {
			// We already have this key.
			return nil
		}
		header.PartitionKeys = append(header.PartitionKeys, me.Key())
		bufSize = 0
		return nil
	}); err != nil {
		glog.Errorf("error while iterating over buf: %v", err)
		x.Check(err)
	}

	// Write the header to the map file.
	headerBuf, err := header.Marshal()
	x.Check(err)
	lenBuf := make([]byte, 4)
	binary.BigEndian.PutUint32(lenBuf, uint32(len(headerBuf)))
	x.Check2(w.Write(lenBuf))
	x.Check2(w.Write(headerBuf))

	sizeBuf := make([]byte, binary.MaxVarintLen64)

	err = cbuf.SliceIterate(func(slice []byte) error {
		n := binary.PutUvarint(sizeBuf, uint64(len(slice)))
		_, err := w.Write(sizeBuf[:n])
		x.Check(err)

		_, err = w.Write(slice)
		return err
	})
	x.Check(err)
}

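// run drives the map phase for one input chunker: a goroutine parses raw
// chunks into N-Quads (appending the ACL bootstrap triples once, if a
// namespace is forced), while the main loop converts N-Quads into map entries
// and flushes any shard buffer that crosses MapBufSize.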
func (m *mapper) run(inputFormat chunker.InputFormat) {
	chunk := chunker.NewChunker(inputFormat, 1000)
	nquads := chunk.NQuads()
	go func() {
		for chunkBuf := range m.readerChunkCh {
			if err := chunk.Parse(chunkBuf); err != nil {
				atomic.AddInt64(&m.prog.errCount, 1)
				if !m.opt.IgnoreErrors {
					x.Check(err)
				}
			}
		}
		aclOnce.Do(func() {
			if m.opt.Namespace != math.MaxUint64 && m.opt.Namespace != x.GalaxyNamespace {
				// Insert ACL-related RDFs when force-loading the data into a
				// non-galaxy namespace, so that the namespace gets its
				// guardians group and groot user.
				aclNquads := make([]*api.NQuad, 0)
				aclNquads = append(aclNquads, acl.CreateGroupNQuads(x.GuardiansId)...)
				aclNquads = append(aclNquads, acl.CreateUserNQuads(x.GrootId, "password")...)
				aclNquads = append(aclNquads, &api.NQuad{
					Subject:   "_:newuser",
					Predicate: "dgraph.user.group",
					ObjectId:  "_:newgroup",
				})
				nquads.Push(aclNquads...)
			}
		})
		nquads.Flush()
	}()

	for nqs := range nquads.Ch() {
		for _, nq := range nqs {
			if err := facets.SortAndValidate(nq.Facets); err != nil {
				atomic.AddInt64(&m.prog.errCount, 1)
				if !m.opt.IgnoreErrors {
					x.Check(err)
				}
			}

			m.processNQuad(dql.NQuad{NQuad: nq})
			atomic.AddInt64(&m.prog.nquadCount, 1)
		}

		for i := range m.shards {
			sh := &m.shards[i]
			if uint64(sh.cbuf.LenNoPadding()) >= m.opt.MapBufSize {
				sh.mu.Lock() // One write at a time.
				go m.writeMapEntriesToFile(sh.cbuf, i)
				// Hand the full buffer off to the writer goroutine (which
				// releases it) and start the next batch with a fresh buffer.
				sh.cbuf = newMapperBuffer(m.opt)
			}
		}
	}

	for i := range m.shards {
		sh := &m.shards[i]
		if sh.cbuf.LenNoPadding() > 0 {
			sh.mu.Lock() // One write at a time.
			m.writeMapEntriesToFile(sh.cbuf, i)
		} else {
			if err := sh.cbuf.Release(); err != nil {
				glog.Warningf("error in releasing buffer: %v", err)
			}
		}
		m.shards[i].mu.Lock() // Ensure that the last file write finishes.
	}
}

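// addMapEntry serializes one posting under the given key into the shard's
// buffer, dropping the posting payload when only its UID needs to be kept.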
func (m *mapper) addMapEntry(key []byte, p *pb.Posting, shard int) {
	atomic.AddInt64(&m.prog.mapEdgeCount, 1)

	uid := p.Uid
	if p.PostingType == pb.Posting_REF && len(p.Facets) == 0 {
		// We only needed the UID; drop the posting itself.
		p = nil
	}

	sh := &m.shards[shard]

	sz := mapEntrySize(key, p)
	dst := sh.cbuf.SliceAllocate(sz)
	marshalMapEntry(dst, uid, key, p)
}

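// processNQuad resolves the subject (and the object, for UID edges) to UIDs,
// builds the directed edge and its postings, and emits data, reverse, and
// index map entries for the N-Quad.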
func (m *mapper) processNQuad(nq dql.NQuad) {
	if m.opt.Namespace != math.MaxUint64 {
		// Use the namespace passed through the '--force-namespace' flag.
		nq.Namespace = m.opt.Namespace
	}
	sid := m.uid(nq.GetSubject(), nq.Namespace)
	if sid == 0 {
		panic(fmt.Sprintf("invalid UID with value 0 for %v", nq.GetSubject()))
	}
	var oid uint64
	var de *pb.DirectedEdge
	if nq.GetObjectValue() == nil {
		oid = m.uid(nq.GetObjectId(), nq.Namespace)
		if oid == 0 {
			panic(fmt.Sprintf("invalid UID with value 0 for %v", nq.GetObjectId()))
		}
		de = nq.CreateUidEdge(sid, oid)
	} else {
		var err error
		de, err = nq.CreateValueEdge(sid)
		x.Check(err)
	}

	m.schema.checkAndSetInitialSchema(nq.Namespace)

	// An appropriate schema must exist for the N-Quad's namespace by this time.
	de.Attr = x.NamespaceAttr(de.Namespace, de.Attr)
	fwd, rev := m.createPostings(nq, de)
	shard := m.state.shards.shardFor(de.Attr)
	key := x.DataKey(de.Attr, sid)
	m.addMapEntry(key, fwd, shard)

	if rev != nil {
		key = x.ReverseKey(de.Attr, oid)
		m.addMapEntry(key, rev, shard)
	}
	m.addIndexMapEntries(nq, de)
}

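// uid returns the UID for an xid. Unless the NewUids option is set, a numeric
// xid is used as the UID directly; otherwise one is assigned (or looked up)
// per namespace.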
func (m *mapper) uid(xid string, ns uint64) uint64 {
	if !m.opt.NewUids {
		if uid, err := strconv.ParseUint(xid, 0, 64); err == nil {
			m.xids.BumpTo(uid)
			return uid
		}
	}

	return m.lookupUid(xid, ns)
}

func (m *mapper) lookupUid(xid string, ns uint64) uint64 {
	// The xid passed in here is a substring of the line read from the input
	// file. Whatever we hand to AssignUid is stored in a map and stays alive
	// for the whole process, and keeping xid alive would keep the entire line
	// alive and prevent it from being GC'd. Earlier this meant copying xid via
	// a strings.Builder before assigning a UID; with the Trie-based xid map
	// that copy is no longer needed, because the Trie uses its own storage for
	// the strings.

	// N-Quads from different namespaces may share the same xid, so namespace
	// the key before assigning a UID.
	uid, isNew := m.xids.AssignUid(x.NamespaceAttr(ns, xid))
	if !m.opt.StoreXids || !isNew {
		return uid
	}
	if strings.HasPrefix(xid, "_:") {
		// Don't store xids for blank nodes.
		return uid
	}
	nq := dql.NQuad{NQuad: &api.NQuad{
		Subject:   xid,
		Predicate: "xid",
		ObjectValue: &api.Value{
			Val: &api.Value_StrVal{StrVal: xid},
		},
		Namespace: ns,
	}}
	m.processNQuad(nq)
	return uid
}

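// createPostings builds the forward posting for the edge and, when the
// predicate's schema has the reverse directive, a reverse posting as well.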
func (m *mapper) createPostings(nq dql.NQuad,
	de *pb.DirectedEdge) (*pb.Posting, *pb.Posting) {

	m.schema.validateType(de, nq.ObjectValue == nil)

	p := posting.NewPosting(de)
	sch := m.schema.getSchema(x.NamespaceAttr(nq.GetNamespace(), nq.GetPredicate()))
	if nq.GetObjectValue() != nil {
		lang := de.GetLang()
		switch {
		case len(lang) > 0:
			p.Uid = farm.Fingerprint64([]byte(lang))
		case sch.List:
			p.Uid = farm.Fingerprint64(de.Value)
		default:
			p.Uid = math.MaxUint64
		}
	}
	p.Facets = nq.Facets

	// Early exit if the predicate has no reverse edge.
	if sch.GetDirective() != pb.SchemaUpdate_REVERSE {
		return p, nil
	}

	// Reverse predicate.
	x.AssertTruef(nq.GetObjectValue() == nil, "only a UID object can have a reverse schema")
	de.Entity, de.ValueId = de.ValueId, de.Entity
	m.schema.validateType(de, true)
	rp := posting.NewPosting(de)

	de.Entity, de.ValueId = de.ValueId, de.Entity // de is reused, so swap back.

	return p, rp
}

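// addIndexMapEntries emits one index map entry per token for every tokenizer
// configured on the predicate's schema. Only value edges are indexed.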
func (m *mapper) addIndexMapEntries(nq dql.NQuad, de *pb.DirectedEdge) {
	if nq.GetObjectValue() == nil {
		return // Cannot index UIDs.
	}

	sch := m.schema.getSchema(x.NamespaceAttr(nq.GetNamespace(), nq.GetPredicate()))
	for _, tokerName := range sch.GetTokenizer() {
		// Find the tokenizer.
		toker, ok := tok.GetTokenizer(tokerName)
		if !ok {
			log.Fatalf("unknown tokenizer %q", tokerName)
		}

		// Create the storage value.
		storageVal := types.Val{
			Tid:   types.TypeID(de.GetValueType()),
			Value: de.GetValue(),
		}

		// Convert from storage type to schema type.
		schemaVal, err := types.Convert(storageVal, types.TypeID(sch.GetValueType()))
		// This shouldn't error, since we've already checked for convertibility
		// when building the edge postings, so it's okay to be fatal here.
		x.Check(err)

		// Extract tokens.
		toks, err := tok.BuildTokens(schemaVal.Value, tok.GetTokenizerForLang(toker, nq.Lang))
		x.Check(err)

		// Store an index posting for each token.
		attr := x.NamespaceAttr(nq.Namespace, nq.Predicate)
		for _, t := range toks {
			m.addMapEntry(
				x.IndexKey(attr, t),
				&pb.Posting{
					Uid:         de.GetEntity(),
					PostingType: pb.Posting_REF,
				},
				m.state.shards.shardFor(attr),
			)
		}
	}
}