github.com/unigraph-dev/dgraph@v1.1.1-0.20200923154953-8b52b426f765/dgraph/cmd/bulk/mapper.go

github.com/unigraph-dev/dgraph@v1.1.1-0.20200923154953-8b52b426f765/dgraph/cmd/bulk/mapper.go (about)

     1  /*
     2   * Copyright 2017-2018 Dgraph Labs, Inc. and Contributors
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package bulk
    18  
    19  import (
    20  	"bufio"
    21  	"bytes"
    22  	"compress/gzip"
    23  	"encoding/binary"
    24  	"fmt"
    25  	"log"
    26  	"math"
    27  	"os"
    28  	"path/filepath"
    29  	"sort"
    30  	"strconv"
    31  	"strings"
    32  	"sync"
    33  	"sync/atomic"
    34  
    35  	"github.com/dgraph-io/dgo/protos/api"
    36  	"github.com/dgraph-io/dgraph/chunker"
    37  	"github.com/dgraph-io/dgraph/gql"
    38  	"github.com/dgraph-io/dgraph/posting"
    39  	"github.com/dgraph-io/dgraph/protos/pb"
    40  	"github.com/dgraph-io/dgraph/tok"
    41  	"github.com/dgraph-io/dgraph/types"
    42  	"github.com/dgraph-io/dgraph/types/facets"
    43  	"github.com/dgraph-io/dgraph/x"
    44  	farm "github.com/dgryski/go-farm"
    45  )
    46  
    47  type mapper struct {
    48  	*state
    49  	shards []shardState // shard is based on predicate
    50  	mePool *sync.Pool
    51  }
    52  
    53  type shardState struct {
    54  	// Buffer up map entries until we have a sufficient amount, then sort and
    55  	// write them to file.
    56  	entries     []*pb.MapEntry
    57  	encodedSize uint64
    58  	mu          sync.Mutex // Allow only 1 write per shard at a time.
    59  }
    60  
    61  func newMapper(st *state) *mapper {
    62  	return &mapper{
    63  		state:  st,
    64  		shards: make([]shardState, st.opt.MapShards),
    65  		mePool: &sync.Pool{
    66  			New: func() interface{} {
    67  				return &pb.MapEntry{}
    68  			},
    69  		},
    70  	}
    71  }
    72  
    73  func less(lhs, rhs *pb.MapEntry) bool {
    74  	if keyCmp := bytes.Compare(lhs.Key, rhs.Key); keyCmp != 0 {
    75  		return keyCmp < 0
    76  	}
    77  	lhsUID := lhs.Uid
    78  	rhsUID := rhs.Uid
    79  	if lhs.Posting != nil {
    80  		lhsUID = lhs.Posting.Uid
    81  	}
    82  	if rhs.Posting != nil {
    83  		rhsUID = rhs.Posting.Uid
    84  	}
    85  	return lhsUID < rhsUID
    86  }
    87  
    88  func (m *mapper) openOutputFile(shardIdx int) (*os.File, error) {
    89  	fileNum := atomic.AddUint32(&m.mapFileId, 1)
    90  	filename := filepath.Join(
    91  		m.opt.TmpDir,
    92  		"shards",
    93  		fmt.Sprintf("%03d", shardIdx),
    94  		fmt.Sprintf("%06d.map.gz", fileNum),
    95  	)
    96  	x.Check(os.MkdirAll(filepath.Dir(filename), 0755))
    97  	return os.OpenFile(filename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
    98  }
    99  
   100  func (m *mapper) writeMapEntriesToFile(entries []*pb.MapEntry, encodedSize uint64, shardIdx int) {
   101  	defer m.shards[shardIdx].mu.Unlock() // Locked by caller.
   102  
   103  	sort.Slice(entries, func(i, j int) bool {
   104  		return less(entries[i], entries[j])
   105  	})
   106  
   107  	f, err := m.openOutputFile(shardIdx)
   108  	x.Check(err)
   109  
   110  	defer func() {
   111  		x.Check(f.Sync())
   112  		x.Check(f.Close())
   113  	}()
   114  
   115  	gzWriter := gzip.NewWriter(f)
   116  	w := bufio.NewWriter(gzWriter)
   117  	defer func() {
   118  		x.Check(w.Flush())
   119  		x.Check(gzWriter.Flush())
   120  		x.Check(gzWriter.Close())
   121  	}()
   122  
   123  	sizeBuf := make([]byte, binary.MaxVarintLen64)
   124  	for _, me := range entries {
   125  		n := binary.PutUvarint(sizeBuf, uint64(me.Size()))
   126  		_, err := w.Write(sizeBuf[:n])
   127  		x.Check(err)
   128  
   129  		meBuf, err := me.Marshal()
   130  		x.Check(err)
   131  		_, err = w.Write(meBuf)
   132  		x.Check(err)
   133  		m.mePool.Put(me)
   134  	}
   135  }
   136  
   137  func (m *mapper) run(inputFormat chunker.InputFormat) {
   138  	chunker := chunker.NewChunker(inputFormat, 1000)
   139  	nquads := chunker.NQuads()
   140  	go func() {
   141  		for chunkBuf := range m.readerChunkCh {
   142  			if err := chunker.Parse(chunkBuf); err != nil {
   143  				atomic.AddInt64(&m.prog.errCount, 1)
   144  				if !m.opt.IgnoreErrors {
   145  					x.Check(err)
   146  				}
   147  			}
   148  		}
   149  		nquads.Flush()
   150  	}()
   151  
   152  	for nqs := range nquads.Ch() {
   153  		for _, nq := range nqs {
   154  			if err := facets.SortAndValidate(nq.Facets); err != nil {
   155  				atomic.AddInt64(&m.prog.errCount, 1)
   156  				if !m.opt.IgnoreErrors {
   157  					x.Check(err)
   158  				}
   159  			}
   160  
   161  			m.processNQuad(gql.NQuad{NQuad: nq})
   162  			atomic.AddInt64(&m.prog.nquadCount, 1)
   163  		}
   164  
   165  		for i := range m.shards {
   166  			sh := &m.shards[i]
   167  			if sh.encodedSize >= m.opt.MapBufSize {
   168  				sh.mu.Lock() // One write at a time.
   169  				go m.writeMapEntriesToFile(sh.entries, sh.encodedSize, i)
   170  				// Clear the entries and encodedSize for the next batch.
   171  				// Proactively allocate 32 slots to bootstrap the entries slice.
   172  				sh.entries = make([]*pb.MapEntry, 0, 32)
   173  				sh.encodedSize = 0
   174  			}
   175  		}
   176  	}
   177  
   178  	for i := range m.shards {
   179  		sh := &m.shards[i]
   180  		if len(sh.entries) > 0 {
   181  			sh.mu.Lock() // One write at a time.
   182  			m.writeMapEntriesToFile(sh.entries, sh.encodedSize, i)
   183  		}
   184  		m.shards[i].mu.Lock() // Ensure that the last file write finishes.
   185  	}
   186  }
   187  
   188  func (m *mapper) addMapEntry(key []byte, p *pb.Posting, shard int) {
   189  	atomic.AddInt64(&m.prog.mapEdgeCount, 1)
   190  
   191  	me := m.mePool.Get().(*pb.MapEntry)
   192  	*me = pb.MapEntry{Key: key}
   193  
   194  	if p.PostingType != pb.Posting_REF || len(p.Facets) > 0 {
   195  		me.Posting = p
   196  	} else {
   197  		me.Uid = p.Uid
   198  	}
   199  	sh := &m.shards[shard]
   200  
   201  	var err error
   202  	sh.entries = append(sh.entries, me)
   203  	sh.encodedSize += uint64(me.Size())
   204  	x.Check(err)
   205  }
   206  
   207  func (m *mapper) processNQuad(nq gql.NQuad) {
   208  	sid := m.uid(nq.GetSubject())
   209  	var oid uint64
   210  	var de *pb.DirectedEdge
   211  	if nq.GetObjectValue() == nil {
   212  		oid = m.uid(nq.GetObjectId())
   213  		de = nq.CreateUidEdge(sid, oid)
   214  	} else {
   215  		var err error
   216  		de, err = nq.CreateValueEdge(sid)
   217  		x.Check(err)
   218  	}
   219  
   220  	fwd, rev := m.createPostings(nq, de)
   221  	shard := m.state.shards.shardFor(nq.Predicate)
   222  	key := x.DataKey(nq.Predicate, sid)
   223  	m.addMapEntry(key, fwd, shard)
   224  
   225  	if rev != nil {
   226  		key = x.ReverseKey(nq.Predicate, oid)
   227  		m.addMapEntry(key, rev, shard)
   228  	}
   229  	m.addIndexMapEntries(nq, de)
   230  }
   231  
   232  func (m *mapper) uid(xid string) uint64 {
   233  	if !m.opt.NewUids {
   234  		if uid, err := strconv.ParseUint(xid, 0, 64); err == nil {
   235  			m.xids.BumpTo(uid)
   236  			return uid
   237  		}
   238  	}
   239  
   240  	return m.lookupUid(xid)
   241  }
   242  
   243  func (m *mapper) lookupUid(xid string) uint64 {
   244  	uid := m.xids.AssignUid(xid)
   245  	if !m.opt.StoreXids {
   246  		return uid
   247  	}
   248  	if strings.HasPrefix(xid, "_:") {
   249  		// Don't store xids for blank nodes.
   250  		return uid
   251  	}
   252  	nq := gql.NQuad{NQuad: &api.NQuad{
   253  		Subject:   xid,
   254  		Predicate: "xid",
   255  		ObjectValue: &api.Value{
   256  			Val: &api.Value_StrVal{StrVal: xid},
   257  		},
   258  	}}
   259  	m.processNQuad(nq)
   260  	return uid
   261  }
   262  
   263  func (m *mapper) createPostings(nq gql.NQuad,
   264  	de *pb.DirectedEdge) (*pb.Posting, *pb.Posting) {
   265  
   266  	m.schema.validateType(de, nq.ObjectValue == nil)
   267  
   268  	p := posting.NewPosting(de)
   269  	sch := m.schema.getSchema(nq.GetPredicate())
   270  	if nq.GetObjectValue() != nil {
   271  		if lang := de.GetLang(); len(lang) > 0 {
   272  			p.Uid = farm.Fingerprint64([]byte(lang))
   273  		} else if sch.List {
   274  			p.Uid = farm.Fingerprint64(de.Value)
   275  		} else {
   276  			p.Uid = math.MaxUint64
   277  		}
   278  	}
   279  	p.Facets = nq.Facets
   280  
   281  	// Early exit for no reverse edge.
   282  	if sch.GetDirective() != pb.SchemaUpdate_REVERSE {
   283  		return p, nil
   284  	}
   285  
   286  	// Reverse predicate
   287  	x.AssertTruef(nq.GetObjectValue() == nil, "only has reverse schema if object is UID")
   288  	de.Entity, de.ValueId = de.ValueId, de.Entity
   289  	m.schema.validateType(de, true)
   290  	rp := posting.NewPosting(de)
   291  
   292  	de.Entity, de.ValueId = de.ValueId, de.Entity // de reused so swap back.
   293  
   294  	return p, rp
   295  }
   296  
   297  func (m *mapper) addIndexMapEntries(nq gql.NQuad, de *pb.DirectedEdge) {
   298  	if nq.GetObjectValue() == nil {
   299  		return // Cannot index UIDs
   300  	}
   301  
   302  	sch := m.schema.getSchema(nq.GetPredicate())
   303  	for _, tokerName := range sch.GetTokenizer() {
   304  		// Find tokeniser.
   305  		toker, ok := tok.GetTokenizer(tokerName)
   306  		if !ok {
   307  			log.Fatalf("unknown tokenizer %q", tokerName)
   308  		}
   309  
   310  		// Create storage value.
   311  		storageVal := types.Val{
   312  			Tid:   types.TypeID(de.GetValueType()),
   313  			Value: de.GetValue(),
   314  		}
   315  
   316  		// Convert from storage type to schema type.
   317  		schemaVal, err := types.Convert(storageVal, types.TypeID(sch.GetValueType()))
   318  		// Shouldn't error, since we've already checked for convertibility when
   319  		// doing edge postings. So okay to be fatal.
   320  		x.Check(err)
   321  
   322  		// Extract tokens.
   323  		toks, err := tok.BuildTokens(schemaVal.Value, tok.GetLangTokenizer(toker, nq.Lang))
   324  		x.Check(err)
   325  
   326  		// Store index posting.
   327  		for _, t := range toks {
   328  			m.addMapEntry(
   329  				x.IndexKey(nq.Predicate, t),
   330  				&pb.Posting{
   331  					Uid:         de.GetEntity(),
   332  					PostingType: pb.Posting_REF,
   333  				},
   334  				m.state.shards.shardFor(nq.Predicate),
   335  			)
   336  		}
   337  	}
   338  }