github.com/unigraph-dev/dgraph@v1.1.1-0.20200923154953-8b52b426f765/posting/list.go (about) 1 /* 2 * Copyright 2015-2018 Dgraph Labs, Inc. and Contributors 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package posting 18 19 import ( 20 "bytes" 21 "context" 22 "log" 23 "math" 24 "sort" 25 "sync" 26 27 "github.com/dgryski/go-farm" 28 29 bpb "github.com/dgraph-io/badger/pb" 30 "github.com/dgraph-io/dgo/protos/api" 31 "github.com/dgraph-io/dgo/y" 32 "github.com/dgraph-io/dgraph/algo" 33 "github.com/dgraph-io/dgraph/codec" 34 "github.com/dgraph-io/dgraph/protos/pb" 35 "github.com/dgraph-io/dgraph/schema" 36 "github.com/dgraph-io/dgraph/types" 37 "github.com/dgraph-io/dgraph/types/facets" 38 "github.com/dgraph-io/dgraph/x" 39 "github.com/pkg/errors" 40 ) 41 42 var ( 43 // ErrRetry can be triggered if the posting list got deleted from memory due to a hard commit. 44 // In such a case, retry. 45 ErrRetry = errors.New("Temporary error. Please retry") 46 // ErrNoValue would be returned if no value was found in the posting list. 47 ErrNoValue = errors.New("No value found") 48 // ErrStopIteration is returned when an iteration is terminated early. 49 ErrStopIteration = errors.New("Stop iteration") 50 emptyPosting = &pb.Posting{} 51 maxListSize = mb / 2 52 ) 53 54 const ( 55 // Set means overwrite in mutation layer. It contributes 0 in Length. 56 Set uint32 = 0x01 57 // Del means delete in mutation layer. It contributes -1 in Length. 58 Del uint32 = 0x02 59 60 // BitSchemaPosting signals that the value stores a schema or type. 61 BitSchemaPosting byte = 0x01 62 // BitDeltaPosting signals that the value stores the delta of a posting list. 63 BitDeltaPosting byte = 0x04 64 // BitCompletePosting signals that the values stores a complete posting list. 65 BitCompletePosting byte = 0x08 66 // BitEmptyPosting signals that the value stores an empty posting list. 67 BitEmptyPosting byte = 0x10 68 ) 69 70 // List stores the in-memory representation of a posting list. 71 type List struct { 72 x.SafeMutex 73 key []byte 74 plist *pb.PostingList 75 mutationMap map[uint64]*pb.PostingList 76 minTs uint64 // commit timestamp of immutable layer, reject reads before this ts. 77 maxTs uint64 // max commit timestamp seen for this list. 78 } 79 80 func (l *List) maxVersion() uint64 { 81 l.RLock() 82 defer l.RUnlock() 83 return l.maxTs 84 } 85 86 type pIterator struct { 87 l *List 88 plist *pb.PostingList 89 uidPosting *pb.Posting 90 pidx int // index of postings 91 plen int 92 93 dec *codec.Decoder 94 uids []uint64 95 uidx int // Offset into the uids slice 96 97 afterUid uint64 98 splitIdx int 99 // The timestamp of a delete marker in the mutable layer. If this value is greater 100 // than zero, then the immutable posting list should not be traversed. 101 deleteBelowTs uint64 102 } 103 104 func (it *pIterator) init(l *List, afterUid, deleteBelowTs uint64) error { 105 if deleteBelowTs > 0 && deleteBelowTs <= l.minTs { 106 return errors.Errorf("deleteBelowTs (%d) must be greater than the minTs in the list (%d)", 107 deleteBelowTs, l.minTs) 108 } 109 110 it.l = l 111 it.splitIdx = it.selectInitialSplit(afterUid) 112 if len(it.l.plist.Splits) > 0 { 113 plist, err := l.readListPart(it.l.plist.Splits[it.splitIdx]) 114 if err != nil { 115 return err 116 } 117 it.plist = plist 118 } else { 119 it.plist = l.plist 120 } 121 122 it.afterUid = afterUid 123 it.deleteBelowTs = deleteBelowTs 124 125 it.uidPosting = &pb.Posting{} 126 it.dec = &codec.Decoder{Pack: it.plist.Pack} 127 it.uids = it.dec.Seek(it.afterUid, codec.SeekCurrent) 128 it.uidx = 0 129 130 it.plen = len(it.plist.Postings) 131 it.pidx = sort.Search(it.plen, func(idx int) bool { 132 p := it.plist.Postings[idx] 133 return it.afterUid < p.Uid 134 }) 135 return nil 136 } 137 138 func (it *pIterator) selectInitialSplit(afterUid uint64) int { 139 if afterUid == 0 { 140 return 0 141 } 142 143 for i, startUid := range it.l.plist.Splits { 144 // If startUid == afterUid, the current block should be selected. 145 if startUid == afterUid { 146 return i 147 } 148 // If this split starts at an UID greater than afterUid, there might be 149 // elements in the previous split that need to be checked. 150 if startUid > afterUid { 151 return i - 1 152 } 153 } 154 155 // In case no split's startUid is greater or equal than afterUid, start the 156 // iteration at the start of the last split. 157 return len(it.l.plist.Splits) - 1 158 } 159 160 // moveToNextPart re-initializes the iterator at the start of the next list part. 161 func (it *pIterator) moveToNextPart() error { 162 it.splitIdx++ 163 plist, err := it.l.readListPart(it.l.plist.Splits[it.splitIdx]) 164 if err != nil { 165 return err 166 } 167 it.plist = plist 168 169 it.dec = &codec.Decoder{Pack: it.plist.Pack} 170 // codec.SeekCurrent makes sure we skip returning afterUid during seek. 171 it.uids = it.dec.Seek(it.afterUid, codec.SeekCurrent) 172 it.uidx = 0 173 174 it.plen = len(it.plist.Postings) 175 it.pidx = sort.Search(it.plen, func(idx int) bool { 176 p := it.plist.Postings[idx] 177 return it.afterUid < p.Uid 178 }) 179 180 return nil 181 } 182 183 // moveToNextValidPart moves the iterator to the next part that contains valid data. 184 // This is used to skip over parts of the list that might not contain postings. 185 func (it *pIterator) moveToNextValidPart() error { 186 // Not a multi-part list, the iterator has reached the end of the list. 187 if len(it.l.plist.Splits) == 0 { 188 return nil 189 } 190 191 // If there are no more UIDs to iterate over, move to the next part of the 192 // list that contains valid data. 193 if len(it.uids) == 0 { 194 for it.splitIdx <= len(it.l.plist.Splits)-2 { 195 // moveToNextPart will increment it.splitIdx. Therefore, the for loop must only 196 // continue until len(splits) - 2. 197 if err := it.moveToNextPart(); err != nil { 198 return err 199 } 200 201 if len(it.uids) > 0 { 202 return nil 203 } 204 } 205 } 206 return nil 207 } 208 209 func (it *pIterator) next() error { 210 if it.deleteBelowTs > 0 { 211 it.uids = nil 212 return nil 213 } 214 215 it.uidx++ 216 if it.uidx < len(it.uids) { 217 return nil 218 } 219 it.uidx = 0 220 it.uids = it.dec.Next() 221 222 return it.moveToNextValidPart() 223 } 224 225 func (it *pIterator) valid() (bool, error) { 226 if len(it.uids) > 0 { 227 return true, nil 228 } 229 230 if err := it.moveToNextValidPart(); err != nil { 231 return false, err 232 } else if len(it.uids) > 0 { 233 return true, nil 234 } 235 return false, nil 236 } 237 238 func (it *pIterator) posting() *pb.Posting { 239 uid := it.uids[it.uidx] 240 241 for it.pidx < it.plen { 242 p := it.plist.Postings[it.pidx] 243 if p.Uid > uid { 244 break 245 } 246 if p.Uid == uid { 247 return p 248 } 249 it.pidx++ 250 } 251 it.uidPosting.Uid = uid 252 return it.uidPosting 253 } 254 255 // ListOptions is used in List.Uids (in posting) to customize our output list of 256 // UIDs, for each posting list. It should be pb.to this package. 257 type ListOptions struct { 258 ReadTs uint64 259 AfterUid uint64 // Any UIDs returned must be after this value. 260 Intersect *pb.List // Intersect results with this list of UIDs. 261 } 262 263 // NewPosting takes the given edge and returns its equivalent representation as a posting. 264 func NewPosting(t *pb.DirectedEdge) *pb.Posting { 265 var op uint32 266 if t.Op == pb.DirectedEdge_SET { 267 op = Set 268 } else if t.Op == pb.DirectedEdge_DEL { 269 op = Del 270 } else { 271 x.Fatalf("Unhandled operation: %+v", t) 272 } 273 274 var postingType pb.Posting_PostingType 275 if len(t.Lang) > 0 { 276 postingType = pb.Posting_VALUE_LANG 277 } else if t.ValueId == 0 { 278 postingType = pb.Posting_VALUE 279 } else { 280 postingType = pb.Posting_REF 281 } 282 283 p := postingPool.Get().(*pb.Posting) 284 *p = pb.Posting{ 285 Uid: t.ValueId, 286 Value: t.Value, 287 ValType: t.ValueType, 288 PostingType: postingType, 289 LangTag: []byte(t.Lang), 290 Label: t.Label, 291 Op: op, 292 Facets: t.Facets, 293 } 294 return p 295 } 296 297 func hasDeleteAll(mpost *pb.Posting) bool { 298 return mpost.Op == Del && bytes.Equal(mpost.Value, []byte(x.Star)) && len(mpost.LangTag) == 0 299 } 300 301 // Ensure that you either abort the uncommitted postings or commit them before calling me. 302 func (l *List) updateMutationLayer(mpost *pb.Posting) { 303 l.AssertLock() 304 x.AssertTrue(mpost.Op == Set || mpost.Op == Del) 305 306 // If we have a delete all, then we replace the map entry with just one. 307 if hasDeleteAll(mpost) { 308 plist := &pb.PostingList{} 309 plist.Postings = append(plist.Postings, mpost) 310 l.mutationMap[mpost.StartTs] = plist 311 return 312 } 313 314 plist, ok := l.mutationMap[mpost.StartTs] 315 if !ok { 316 plist := &pb.PostingList{} 317 plist.Postings = append(plist.Postings, mpost) 318 l.mutationMap[mpost.StartTs] = plist 319 return 320 } 321 // Even if we have a delete all in this transaction, we should still pick up any updates since. 322 for i, prev := range plist.Postings { 323 if prev.Uid == mpost.Uid { 324 plist.Postings[i] = mpost 325 return 326 } 327 } 328 plist.Postings = append(plist.Postings, mpost) 329 } 330 331 // TypeID returns the typeid of destination vertex 332 func TypeID(edge *pb.DirectedEdge) types.TypeID { 333 if edge.ValueId != 0 { 334 return types.UidID 335 } 336 return types.TypeID(edge.ValueType) 337 } 338 339 func fingerprintEdge(t *pb.DirectedEdge) uint64 { 340 // There could be a collision if the user gives us a value with Lang = "en" and later gives 341 // us a value = "en" for the same predicate. We would end up overwritting his older lang 342 // value. 343 344 // All edges with a value without LANGTAG, have the same UID. In other words, 345 // an (entity, attribute) can only have one untagged value. 346 var id uint64 = math.MaxUint64 347 348 // Value with a lang type. 349 if len(t.Lang) > 0 { 350 id = farm.Fingerprint64([]byte(t.Lang)) 351 } else if schema.State().IsList(t.Attr) { 352 // TODO - When values are deleted for list type, then we should only delete the UID from 353 // index if no other values produces that index token. 354 // Value for list type. 355 id = farm.Fingerprint64(t.Value) 356 } 357 return id 358 } 359 360 // canMutateUid returns an error if all the following conditions are met. 361 // * Predicate is of type UidID. 362 // * Predicate is not set to a list of UIDs in the schema. 363 // * The existing posting list has an entry that does not match the proposed 364 // mutation's UID. 365 // In this case, the user should delete the existing predicate and retry, or mutate 366 // the schema to allow for multiple UIDs. This method is necessary to support UID 367 // predicates with single values because previously all UID predicates were 368 // considered lists. 369 // This functions returns a nil error in all other cases. 370 func (l *List) canMutateUid(txn *Txn, edge *pb.DirectedEdge) error { 371 l.AssertRLock() 372 373 if types.TypeID(edge.ValueType) != types.UidID { 374 return nil 375 } 376 377 if schema.State().IsList(edge.Attr) { 378 return nil 379 } 380 381 return l.iterate(txn.StartTs, 0, func(obj *pb.Posting) error { 382 if obj.Uid != edge.GetValueId() { 383 return errors.Errorf( 384 "cannot add value with uid %x to predicate %s because one of the existing "+ 385 "values does not match this uid, either delete the existing values first or "+ 386 "modify the schema to '%s: [uid]'", 387 edge.GetValueId(), edge.Attr, edge.Attr) 388 } 389 return nil 390 }) 391 } 392 393 func (l *List) addMutation(ctx context.Context, txn *Txn, t *pb.DirectedEdge) error { 394 l.Lock() 395 defer l.Unlock() 396 return l.addMutationInternal(ctx, txn, t) 397 } 398 399 var postingPool = &sync.Pool{ 400 New: func() interface{} { 401 return &pb.Posting{} 402 }, 403 } 404 405 func (l *List) release() { 406 fromList := func(list *pb.PostingList) { 407 for _, p := range list.GetPostings() { 408 postingPool.Put(p) 409 } 410 } 411 fromList(l.plist) 412 for _, plist := range l.mutationMap { 413 fromList(plist) 414 } 415 l.plist = nil 416 l.mutationMap = nil 417 } 418 419 func (l *List) addMutationInternal(ctx context.Context, txn *Txn, t *pb.DirectedEdge) error { 420 l.AssertLock() 421 422 if txn.ShouldAbort() { 423 return y.ErrConflict 424 } 425 426 getKey := func(key []byte, uid uint64) uint64 { 427 // Instead of creating a string first and then doing a fingerprint, let's do a fingerprint 428 // here to save memory allocations. 429 // Not entirely sure about effect on collision chances due to this simple XOR with uid. 430 return farm.Fingerprint64(key) ^ uid 431 } 432 433 mpost := NewPosting(t) 434 mpost.StartTs = txn.StartTs 435 if mpost.PostingType != pb.Posting_REF { 436 t.ValueId = fingerprintEdge(t) 437 mpost.Uid = t.ValueId 438 } 439 l.updateMutationLayer(mpost) 440 441 // We ensure that commit marks are applied to posting lists in the right 442 // order. We can do so by proposing them in the same order as received by the Oracle delta 443 // stream from Zero, instead of in goroutines. 444 var conflictKey uint64 445 pk, err := x.Parse(l.key) 446 if err != nil { 447 return err 448 } 449 switch { 450 case schema.State().HasUpsert(t.Attr): 451 // Consider checking to see if a email id is unique. A user adds: 452 // <uid> <email> "email@email.org", and there's a string equal tokenizer 453 // and upsert directive on the schema. 454 // Then keys are "<email> <uid>" and "<email> email@email.org" 455 // The first key won't conflict, because two different UIDs can try to 456 // get the same email id. But, the second key would. Thus, we ensure 457 // that two users don't set the same email id. 458 conflictKey = getKey(l.key, 0) 459 460 case pk.IsData() && schema.State().IsList(t.Attr): 461 // Data keys, irrespective of whether they are UID or values, should be judged based on 462 // whether they are lists or not. For UID, t.ValueId = UID. For value, t.ValueId = 463 // fingerprint(value) or could be fingerprint(lang) or something else. 464 // 465 // For singular uid predicate, like partner: uid // no list. 466 // a -> b 467 // a -> c 468 // Run concurrently, only one of them should succeed. 469 // But for friend: [uid], both should succeed. 470 // 471 // Similarly, name: string 472 // a -> "x" 473 // a -> "y" 474 // This should definitely have a conflict. 475 // But, if name: [string], then they can both succeed. 476 conflictKey = getKey(l.key, t.ValueId) 477 478 case pk.IsData(): // NOT a list. This case must happen after the above case. 479 conflictKey = getKey(l.key, 0) 480 481 case pk.IsIndex() || pk.IsCount(): 482 // Index keys are by default of type [uid]. 483 conflictKey = getKey(l.key, t.ValueId) 484 485 default: 486 // Don't assign a conflictKey. 487 } 488 txn.addConflictKey(conflictKey) 489 return nil 490 } 491 492 // getMutation returns a marshaled version of posting list mutation stored internally. 493 func (l *List) getMutation(startTs uint64) []byte { 494 l.RLock() 495 defer l.RUnlock() 496 if pl, ok := l.mutationMap[startTs]; ok { 497 data, err := pl.Marshal() 498 x.Check(err) 499 return data 500 } 501 return nil 502 } 503 504 func (l *List) setMutation(startTs uint64, data []byte) { 505 pl := new(pb.PostingList) 506 x.Check(pl.Unmarshal(data)) 507 508 l.Lock() 509 l.mutationMap[startTs] = pl 510 l.Unlock() 511 } 512 513 // Iterate will allow you to iterate over this posting List, while having acquired a read lock. 514 // So, please keep this iteration cheap, otherwise mutations would get stuck. 515 // The iteration will start after the provided UID. The results would not include this uid. 516 // The function will loop until either the posting List is fully iterated, or you return a false 517 // in the provided function, which will indicate to the function to break out of the iteration. 518 // 519 // pl.Iterate(..., func(p *pb.posting) error { 520 // // Use posting p 521 // return nil // to continue iteration. 522 // return errStopIteration // to break iteration. 523 // }) 524 func (l *List) Iterate(readTs uint64, afterUid uint64, f func(obj *pb.Posting) error) error { 525 l.RLock() 526 defer l.RUnlock() 527 return l.iterate(readTs, afterUid, f) 528 } 529 530 // pickPostings goes through the mutable layer and returns the appropriate postings, 531 // along with the timestamp of the delete marker, if any. If this timestamp is greater 532 // than zero, it indicates that the immutable layer should be ignored during traversals. 533 // If greater than zero, this timestamp must thus be greater than l.minTs. 534 func (l *List) pickPostings(readTs uint64) (uint64, []*pb.Posting) { 535 // This function would return zero ts for entries above readTs. 536 effective := func(start, commit uint64) uint64 { 537 if commit > 0 && commit <= readTs { 538 // Has been committed and below the readTs. 539 return commit 540 } 541 if start == readTs { 542 // This mutation is by ME. So, I must be able to read it. 543 return start 544 } 545 return 0 546 } 547 548 // First pick up the postings. 549 var deleteBelowTs uint64 550 var posts []*pb.Posting 551 for startTs, plist := range l.mutationMap { 552 // Pick up the transactions which are either committed, or the one which is ME. 553 effectiveTs := effective(startTs, plist.CommitTs) 554 if effectiveTs > deleteBelowTs { 555 // We're above the deleteBelowTs marker. We wouldn't reach here if effectiveTs is zero. 556 for _, mpost := range plist.Postings { 557 if hasDeleteAll(mpost) { 558 deleteBelowTs = effectiveTs 559 continue 560 } 561 posts = append(posts, mpost) 562 } 563 } 564 } 565 566 if deleteBelowTs > 0 { 567 // There was a delete all marker. So, trim down the list of postings. 568 result := posts[:0] 569 for _, post := range posts { 570 effectiveTs := effective(post.StartTs, post.CommitTs) 571 if effectiveTs < deleteBelowTs { // Do pick the posts at effectiveTs == deleteBelowTs. 572 continue 573 } 574 result = append(result, post) 575 } 576 posts = result 577 } 578 579 // Sort all the postings by UID (inc order), then by commit/startTs in dec order. 580 sort.Slice(posts, func(i, j int) bool { 581 pi := posts[i] 582 pj := posts[j] 583 if pi.Uid == pj.Uid { 584 ei := effective(pi.StartTs, pi.CommitTs) 585 ej := effective(pj.StartTs, pj.CommitTs) 586 return ei > ej // Pick the higher, so we can discard older commits for the same UID. 587 } 588 return pi.Uid < pj.Uid 589 }) 590 return deleteBelowTs, posts 591 } 592 593 func (l *List) iterate(readTs uint64, afterUid uint64, f func(obj *pb.Posting) error) error { 594 l.AssertRLock() 595 596 deleteBelowTs, mposts := l.pickPostings(readTs) 597 if readTs < l.minTs { 598 return errors.Errorf("readTs: %d less than minTs: %d for key: %q", readTs, l.minTs, l.key) 599 } 600 601 midx, mlen := 0, len(mposts) 602 if afterUid > 0 { 603 midx = sort.Search(mlen, func(idx int) bool { 604 mp := mposts[idx] 605 return afterUid < mp.Uid 606 }) 607 } 608 609 var ( 610 mp, pp *pb.Posting 611 pitr pIterator 612 prevUid uint64 613 err error 614 ) 615 err = pitr.init(l, afterUid, deleteBelowTs) 616 if err != nil { 617 return err 618 } 619 for err == nil { 620 if midx < mlen { 621 mp = mposts[midx] 622 } else { 623 mp = emptyPosting 624 } 625 if valid, err := pitr.valid(); err != nil { 626 return err 627 } else if valid { 628 pp = pitr.posting() 629 } else { 630 pp = emptyPosting 631 } 632 633 switch { 634 case mp.Uid > 0 && mp.Uid == prevUid: 635 // Only pick the latest version of this posting. 636 // mp.Uid can be zero if it's an empty posting. 637 midx++ 638 case pp.Uid == 0 && mp.Uid == 0: 639 // Reached empty posting for both iterators. 640 return nil 641 case mp.Uid == 0 || (pp.Uid > 0 && pp.Uid < mp.Uid): 642 // Either mp is empty, or pp is lower than mp. 643 err = f(pp) 644 if err := pitr.next(); err != nil { 645 return err 646 } 647 case pp.Uid == 0 || (mp.Uid > 0 && mp.Uid < pp.Uid): 648 // Either pp is empty, or mp is lower than pp. 649 if mp.Op != Del { 650 err = f(mp) 651 } 652 prevUid = mp.Uid 653 midx++ 654 case pp.Uid == mp.Uid: 655 if mp.Op != Del { 656 err = f(mp) 657 } 658 prevUid = mp.Uid 659 if err := pitr.next(); err != nil { 660 return err 661 } 662 midx++ 663 default: 664 log.Fatalf("Unhandled case during iteration of posting list.") 665 } 666 } 667 if err == ErrStopIteration { 668 return nil 669 } 670 return err 671 } 672 673 // IsEmpty returns true if there are no uids at the given timestamp after the given UID. 674 func (l *List) IsEmpty(readTs, afterUid uint64) (bool, error) { 675 l.RLock() 676 defer l.RUnlock() 677 var count int 678 err := l.iterate(readTs, afterUid, func(p *pb.Posting) error { 679 count++ 680 return ErrStopIteration 681 }) 682 if err != nil { 683 return false, err 684 } 685 return count == 0, nil 686 } 687 688 func (l *List) length(readTs, afterUid uint64) int { 689 l.AssertRLock() 690 count := 0 691 err := l.iterate(readTs, afterUid, func(p *pb.Posting) error { 692 count++ 693 return nil 694 }) 695 if err != nil { 696 return -1 697 } 698 return count 699 } 700 701 // Length iterates over the mutation layer and counts number of elements. 702 func (l *List) Length(readTs, afterUid uint64) int { 703 l.RLock() 704 defer l.RUnlock() 705 return l.length(readTs, afterUid) 706 } 707 708 // Rollup performs the rollup process, merging the immutable and mutable layers 709 // and outputting the resulting list so it can be written to disk. 710 // During this process, the list might be split into multiple lists if the main 711 // list or any of the existing parts become too big. 712 // 713 // A normal list has the following format: 714 // <key> -> <posting list with all the data for this list> 715 // 716 // A multi-part list is stored in multiple keys. The keys for the parts will be generated by 717 // appending the first UID in the part to the key. The list will have the following format: 718 // <key> -> <posting list that includes no postings but a list of each part's start UID> 719 // <key, 1> -> <first part of the list with all the data for this part> 720 // <key, next start UID> -> <second part of the list with all the data for this part> 721 // ... 722 // <key, last start UID> -> <last part of the list with all its data> 723 // 724 // The first part of a multi-part list always has start UID 1 and will be the last part 725 // to be deleted, at which point the entire list will be marked for deletion. 726 // As the list grows, existing parts might be split if they become too big. 727 func (l *List) Rollup() ([]*bpb.KV, error) { 728 l.RLock() 729 defer l.RUnlock() 730 out, err := l.rollup(math.MaxUint64) 731 if err != nil { 732 return nil, err 733 } 734 if out == nil { 735 return nil, nil 736 } 737 738 var kvs []*bpb.KV 739 kv := &bpb.KV{} 740 kv.Version = out.newMinTs 741 kv.Key = l.key 742 val, meta := marshalPostingList(out.plist) 743 kv.UserMeta = []byte{meta} 744 kv.Value = val 745 kvs = append(kvs, kv) 746 747 for startUid, plist := range out.parts { 748 // Any empty posting list would still have BitEmpty set. And the main posting list 749 // would NOT have that posting list startUid in the splits list. 750 kv := out.marshalPostingListPart(l.key, startUid, plist) 751 kvs = append(kvs, kv) 752 } 753 754 return kvs, nil 755 } 756 757 func (out *rollupOutput) marshalPostingListPart( 758 baseKey []byte, startUid uint64, plist *pb.PostingList) *bpb.KV { 759 kv := &bpb.KV{} 760 kv.Version = out.newMinTs 761 key, err := x.GetSplitKey(baseKey, startUid) 762 x.Check(err) 763 kv.Key = key 764 val, meta := marshalPostingList(plist) 765 kv.UserMeta = []byte{meta} 766 kv.Value = val 767 768 return kv 769 } 770 771 func marshalPostingList(plist *pb.PostingList) ([]byte, byte) { 772 if isPlistEmpty(plist) { 773 return nil, BitEmptyPosting 774 } 775 776 data, err := plist.Marshal() 777 x.Check(err) 778 return data, BitCompletePosting 779 } 780 781 const blockSize int = 256 782 783 type rollupOutput struct { 784 plist *pb.PostingList 785 parts map[uint64]*pb.PostingList 786 newMinTs uint64 787 } 788 789 // Merge all entries in mutation layer with commitTs <= l.commitTs into 790 // immutable layer. Note that readTs can be math.MaxUint64, so do NOT use it 791 // directly. It should only serve as the read timestamp for iteration. 792 func (l *List) rollup(readTs uint64) (*rollupOutput, error) { 793 l.AssertRLock() 794 795 // Pick all committed entries 796 if l.minTs > readTs { 797 // If we are already past the readTs, then skip the rollup. 798 return nil, nil 799 } 800 801 out := &rollupOutput{ 802 plist: &pb.PostingList{ 803 Splits: l.plist.Splits, 804 }, 805 parts: make(map[uint64]*pb.PostingList), 806 } 807 808 var plist *pb.PostingList 809 var enc codec.Encoder 810 var startUid, endUid uint64 811 var splitIdx int 812 813 // Method to properly initialize all the variables described above. 814 init := func() { 815 enc = codec.Encoder{BlockSize: blockSize} 816 817 // If not a multi-part list, all UIDs go to the same encoder. 818 if len(l.plist.Splits) == 0 { 819 plist = out.plist 820 endUid = math.MaxUint64 821 return 822 } 823 824 // Otherwise, load the corresponding part and set endUid to correctly 825 // detect the end of the list. 826 startUid = l.plist.Splits[splitIdx] 827 if splitIdx+1 == len(l.plist.Splits) { 828 endUid = math.MaxUint64 829 } else { 830 endUid = l.plist.Splits[splitIdx+1] - 1 831 } 832 833 plist = &pb.PostingList{} 834 } 835 836 init() 837 err := l.iterate(readTs, 0, func(p *pb.Posting) error { 838 if p.Uid > endUid { 839 plist.Pack = enc.Done() 840 out.parts[startUid] = plist 841 842 splitIdx++ 843 init() 844 } 845 846 enc.Add(p.Uid) 847 if p.Facets != nil || p.PostingType != pb.Posting_REF || len(p.Label) != 0 { 848 plist.Postings = append(plist.Postings, p) 849 } 850 return nil 851 }) 852 // Finish writing the last part of the list (or the whole list if not a multi-part list). 853 x.Check(err) 854 plist.Pack = enc.Done() 855 if len(l.plist.Splits) > 0 { 856 out.parts[startUid] = plist 857 } 858 859 maxCommitTs := l.minTs 860 { 861 // We can't rely upon iterate to give us the max commit timestamp, because it can skip over 862 // postings which had deletions to provide a sorted view of the list. Therefore, the safest 863 // way to get the max commit timestamp is to pick all the relevant postings for the given 864 // readTs and calculate the maxCommitTs. 865 // If deleteBelowTs is greater than zero, there was a delete all marker. The list of 866 // postings has been trimmed down. 867 deleteBelowTs, mposts := l.pickPostings(readTs) 868 maxCommitTs = x.Max(maxCommitTs, deleteBelowTs) 869 for _, mp := range mposts { 870 maxCommitTs = x.Max(maxCommitTs, mp.CommitTs) 871 } 872 } 873 874 // Check if the list (or any of it's parts if it's been previously split) have 875 // become too big. Split the list if that is the case. 876 out.newMinTs = maxCommitTs 877 out.splitUpList() 878 out.removeEmptySplits() 879 return out, nil 880 } 881 882 // ApproxLen returns an approximate count of the UIDs in the posting list. 883 func (l *List) ApproxLen() int { 884 l.RLock() 885 defer l.RUnlock() 886 return len(l.mutationMap) + codec.ApproxLen(l.plist.Pack) 887 } 888 889 // Uids returns the UIDs given some query params. 890 // We have to apply the filtering before applying (offset, count). 891 // WARNING: Calling this function just to get UIDs is expensive 892 func (l *List) Uids(opt ListOptions) (*pb.List, error) { 893 // Pre-assign length to make it faster. 894 l.RLock() 895 // Use approximate length for initial capacity. 896 res := make([]uint64, 0, len(l.mutationMap)+codec.ApproxLen(l.plist.Pack)) 897 out := &pb.List{} 898 if len(l.mutationMap) == 0 && opt.Intersect != nil && len(l.plist.Splits) == 0 { 899 if opt.ReadTs < l.minTs { 900 l.RUnlock() 901 return out, ErrTsTooOld 902 } 903 algo.IntersectCompressedWith(l.plist.Pack, opt.AfterUid, opt.Intersect, out) 904 l.RUnlock() 905 return out, nil 906 } 907 908 err := l.iterate(opt.ReadTs, opt.AfterUid, func(p *pb.Posting) error { 909 if p.PostingType == pb.Posting_REF { 910 res = append(res, p.Uid) 911 } 912 return nil 913 }) 914 l.RUnlock() 915 if err != nil { 916 return out, err 917 } 918 919 // Do The intersection here as it's optimized. 920 out.Uids = res 921 if opt.Intersect != nil { 922 algo.IntersectWith(out, opt.Intersect, out) 923 } 924 return out, nil 925 } 926 927 // Postings calls postFn with the postings that are common with 928 // UIDs in the opt ListOptions. 929 func (l *List) Postings(opt ListOptions, postFn func(*pb.Posting) error) error { 930 l.RLock() 931 defer l.RUnlock() 932 933 return l.iterate(opt.ReadTs, opt.AfterUid, func(p *pb.Posting) error { 934 if p.PostingType != pb.Posting_REF { 935 return nil 936 } 937 return postFn(p) 938 }) 939 } 940 941 // AllUntaggedValues returns all the values in the posting list with no language tag. 942 func (l *List) AllUntaggedValues(readTs uint64) ([]types.Val, error) { 943 l.RLock() 944 defer l.RUnlock() 945 946 var vals []types.Val 947 err := l.iterate(readTs, 0, func(p *pb.Posting) error { 948 if len(p.LangTag) == 0 { 949 vals = append(vals, types.Val{ 950 Tid: types.TypeID(p.ValType), 951 Value: p.Value, 952 }) 953 } 954 return nil 955 }) 956 return vals, err 957 } 958 959 // AllValues returns all the values in the posting list. 960 func (l *List) AllValues(readTs uint64) ([]types.Val, error) { 961 l.RLock() 962 defer l.RUnlock() 963 964 var vals []types.Val 965 err := l.iterate(readTs, 0, func(p *pb.Posting) error { 966 vals = append(vals, types.Val{ 967 Tid: types.TypeID(p.ValType), 968 Value: p.Value, 969 }) 970 return nil 971 }) 972 return vals, err 973 } 974 975 // GetLangTags finds the language tags of each posting in the list. 976 func (l *List) GetLangTags(readTs uint64) ([]string, error) { 977 l.RLock() 978 defer l.RUnlock() 979 980 var tags []string 981 err := l.iterate(readTs, 0, func(p *pb.Posting) error { 982 tags = append(tags, string(p.LangTag)) 983 return nil 984 }) 985 return tags, err 986 } 987 988 // Value returns the default value from the posting list. The default value is 989 // defined as the value without a language tag. 990 func (l *List) Value(readTs uint64) (rval types.Val, rerr error) { 991 l.RLock() 992 defer l.RUnlock() 993 val, found, err := l.findValue(readTs, math.MaxUint64) 994 if err != nil { 995 return val, err 996 } 997 if !found { 998 return val, ErrNoValue 999 } 1000 return val, nil 1001 } 1002 1003 // ValueFor returns a value from posting list, according to preferred language list. 1004 // If list is empty, value without language is returned; if such value is not 1005 // available, value with smallest UID is returned. 1006 // If list consists of one or more languages, first available value is returned. 1007 // If no language from the list matches the values, processing is the same as for empty list. 1008 func (l *List) ValueFor(readTs uint64, langs []string) (rval types.Val, rerr error) { 1009 l.RLock() // All public methods should acquire locks, while private ones should assert them. 1010 defer l.RUnlock() 1011 p, err := l.postingFor(readTs, langs) 1012 if err != nil { 1013 return rval, err 1014 } 1015 return valueToTypesVal(p), nil 1016 } 1017 1018 func (l *List) postingFor(readTs uint64, langs []string) (p *pb.Posting, rerr error) { 1019 l.AssertRLock() // Avoid recursive locking by asserting a lock here. 1020 return l.postingForLangs(readTs, langs) 1021 } 1022 1023 // ValueForTag returns the value in the posting list with the given language tag. 1024 func (l *List) ValueForTag(readTs uint64, tag string) (rval types.Val, rerr error) { 1025 l.RLock() 1026 defer l.RUnlock() 1027 p, err := l.postingForTag(readTs, tag) 1028 if err != nil { 1029 return rval, err 1030 } 1031 return valueToTypesVal(p), nil 1032 } 1033 1034 func valueToTypesVal(p *pb.Posting) (rval types.Val) { 1035 // This is ok because we dont modify the value of a posting. We create a newPosting 1036 // and add it to the PostingList to do a set. 1037 rval.Value = p.Value 1038 rval.Tid = types.TypeID(p.ValType) 1039 return 1040 } 1041 1042 func (l *List) postingForLangs(readTs uint64, langs []string) (pos *pb.Posting, rerr error) { 1043 l.AssertRLock() 1044 1045 any := false 1046 // look for language in preferred order 1047 for _, lang := range langs { 1048 if lang == "." { 1049 any = true 1050 break 1051 } 1052 pos, rerr = l.postingForTag(readTs, lang) 1053 if rerr == nil { 1054 return pos, nil 1055 } 1056 } 1057 1058 // look for value without language 1059 if any || len(langs) == 0 { 1060 if found, pos, err := l.findPosting(readTs, math.MaxUint64); err != nil { 1061 return nil, err 1062 } else if found { 1063 return pos, nil 1064 } 1065 } 1066 1067 var found bool 1068 // last resort - return value with smallest lang UID. 1069 if any { 1070 err := l.iterate(readTs, 0, func(p *pb.Posting) error { 1071 if p.PostingType == pb.Posting_VALUE_LANG { 1072 pos = p 1073 found = true 1074 return ErrStopIteration 1075 } 1076 return nil 1077 }) 1078 if err != nil { 1079 return nil, err 1080 } 1081 } 1082 1083 if found { 1084 return pos, nil 1085 } 1086 1087 return pos, ErrNoValue 1088 } 1089 1090 func (l *List) postingForTag(readTs uint64, tag string) (p *pb.Posting, rerr error) { 1091 l.AssertRLock() 1092 uid := farm.Fingerprint64([]byte(tag)) 1093 found, p, err := l.findPosting(readTs, uid) 1094 if err != nil { 1095 return p, err 1096 } 1097 if !found { 1098 return p, ErrNoValue 1099 } 1100 1101 return p, nil 1102 } 1103 1104 func (l *List) findValue(readTs, uid uint64) (rval types.Val, found bool, err error) { 1105 l.AssertRLock() 1106 found, p, err := l.findPosting(readTs, uid) 1107 if !found { 1108 return rval, found, err 1109 } 1110 1111 return valueToTypesVal(p), true, nil 1112 } 1113 1114 func (l *List) findPosting(readTs uint64, uid uint64) (found bool, pos *pb.Posting, err error) { 1115 // Iterate starts iterating after the given argument, so we pass UID - 1 1116 err = l.iterate(readTs, uid-1, func(p *pb.Posting) error { 1117 if p.Uid == uid { 1118 pos = p 1119 found = true 1120 } 1121 return ErrStopIteration 1122 }) 1123 1124 return found, pos, err 1125 } 1126 1127 // Facets gives facets for the posting representing value. 1128 func (l *List) Facets(readTs uint64, param *pb.FacetParams, langs []string) (fs []*api.Facet, 1129 ferr error) { 1130 l.RLock() 1131 defer l.RUnlock() 1132 p, err := l.postingFor(readTs, langs) 1133 if err != nil { 1134 return nil, err 1135 } 1136 return facets.CopyFacets(p.Facets, param), nil 1137 } 1138 1139 func (l *List) readListPart(startUid uint64) (*pb.PostingList, error) { 1140 key, err := x.GetSplitKey(l.key, startUid) 1141 if err != nil { 1142 return nil, err 1143 } 1144 txn := pstore.NewTransactionAt(l.minTs, false) 1145 item, err := txn.Get(key) 1146 if err != nil { 1147 return nil, err 1148 } 1149 part := &pb.PostingList{} 1150 if err := unmarshalOrCopy(part, item); err != nil { 1151 return nil, err 1152 } 1153 return part, nil 1154 } 1155 1156 // shouldSplit returns true if the given plist should be split in two. 1157 func shouldSplit(plist *pb.PostingList) bool { 1158 return plist.Size() >= maxListSize && len(plist.Pack.Blocks) > 1 1159 } 1160 1161 // splitUpList checks the list and splits it in smaller parts if needed. 1162 func (out *rollupOutput) splitUpList() { 1163 // Contains the posting lists that should be split. 1164 var lists []*pb.PostingList 1165 1166 // If list is not split yet, insert the main list. 1167 if len(out.plist.Splits) == 0 { 1168 lists = append(lists, out.plist) 1169 } 1170 1171 // Insert the split lists if they exist. 1172 for _, startUid := range out.splits() { 1173 part := out.parts[startUid] 1174 lists = append(lists, part) 1175 } 1176 1177 // List of startUids for each list part after the splitting process is complete. 1178 var newSplits []uint64 1179 1180 for i, list := range lists { 1181 startUid := uint64(1) 1182 // If the list is split, select the right startUid for this list. 1183 if len(out.plist.Splits) > 0 { 1184 startUid = out.plist.Splits[i] 1185 } 1186 1187 if shouldSplit(list) { 1188 // Split the list. Update out.splits with the new lists and add their 1189 // start UIDs to the list of new splits. 1190 startUids, pls := binSplit(startUid, list) 1191 for i, startUid := range startUids { 1192 out.parts[startUid] = pls[i] 1193 newSplits = append(newSplits, startUid) 1194 } 1195 } else { 1196 // No need to split the list. Add the startUid to the array of new splits. 1197 newSplits = append(newSplits, startUid) 1198 } 1199 } 1200 1201 // No new lists were created so there's no need to update the list of splits. 1202 if len(newSplits) == len(lists) { 1203 return 1204 } 1205 1206 // The splits changed so update them. 1207 out.plist = &pb.PostingList{ 1208 Splits: newSplits, 1209 } 1210 } 1211 1212 // binSplit takes the given plist and returns two new plists, each with 1213 // half of the blocks and postings of the original as well as the new startUids 1214 // for each of the new parts. 1215 func binSplit(lowUid uint64, plist *pb.PostingList) ([]uint64, []*pb.PostingList) { 1216 midBlock := len(plist.Pack.Blocks) / 2 1217 midUid := plist.Pack.Blocks[midBlock].GetBase() 1218 1219 // Generate posting list holding the first half of the current list's postings. 1220 lowPl := new(pb.PostingList) 1221 lowPl.Pack = &pb.UidPack{ 1222 BlockSize: plist.Pack.BlockSize, 1223 Blocks: plist.Pack.Blocks[:midBlock], 1224 } 1225 1226 // Generate posting list holding the second half of the current list's postings. 1227 highPl := new(pb.PostingList) 1228 highPl.Pack = &pb.UidPack{ 1229 BlockSize: plist.Pack.BlockSize, 1230 Blocks: plist.Pack.Blocks[midBlock:], 1231 } 1232 1233 // Add elements in plist.Postings to the corresponding list. 1234 for _, posting := range plist.Postings { 1235 if posting.Uid < midUid { 1236 lowPl.Postings = append(lowPl.Postings, posting) 1237 } else { 1238 highPl.Postings = append(highPl.Postings, posting) 1239 } 1240 } 1241 1242 return []uint64{lowUid, midUid}, []*pb.PostingList{lowPl, highPl} 1243 } 1244 1245 // removeEmptySplits updates the split list by removing empty posting lists' startUids. 1246 func (out *rollupOutput) removeEmptySplits() { 1247 var splits []uint64 1248 for startUid, plist := range out.parts { 1249 // Do not remove the first split for now, as every multi-part list should always 1250 // have a split starting with UID 1. 1251 if startUid == 1 { 1252 splits = append(splits, startUid) 1253 continue 1254 } 1255 1256 if !isPlistEmpty(plist) { 1257 splits = append(splits, startUid) 1258 } 1259 } 1260 out.plist.Splits = splits 1261 sortSplits(splits) 1262 1263 if len(out.plist.Splits) == 1 { 1264 // Only the first split remains. If it's also empty, remove it as well. 1265 // This should mark the entire list for deletion. 1266 if isPlistEmpty(out.parts[1]) { 1267 out.plist.Splits = []uint64{} 1268 } 1269 } 1270 } 1271 1272 // Returns the sorted list of start UIDs based on the keys in out.parts. 1273 // out.parts is considered the source of truth so this method is considered 1274 // safer than using out.plist.Splits directly. 1275 func (out *rollupOutput) splits() []uint64 { 1276 var splits []uint64 1277 for startUid := range out.parts { 1278 splits = append(splits, startUid) 1279 } 1280 sortSplits(splits) 1281 return splits 1282 } 1283 1284 // isPlistEmpty returns true if the given plist is empty. Plists with splits are 1285 // considered non-empty. 1286 func isPlistEmpty(plist *pb.PostingList) bool { 1287 if len(plist.Splits) > 0 { 1288 return false 1289 } 1290 if plist.Pack == nil || len(plist.Pack.Blocks) == 0 { 1291 return true 1292 } 1293 return false 1294 } 1295 1296 func sortSplits(splits []uint64) { 1297 sort.Slice(splits, func(i, j int) bool { 1298 return splits[i] < splits[j] 1299 }) 1300 } 1301 1302 // PartSplits returns an empty array if the list has not been split into multiple parts. 1303 // Otherwise, it returns an array containing the start UID of each part. 1304 func (l *List) PartSplits() []uint64 { 1305 splits := make([]uint64, len(l.plist.Splits)) 1306 copy(splits, l.plist.Splits) 1307 return splits 1308 } 1309 1310 // ToBackupPostingList converts a posting list into its representation used for storing backups. 1311 func ToBackupPostingList(l *pb.PostingList) *pb.BackupPostingList { 1312 bl := pb.BackupPostingList{} 1313 if l == nil { 1314 return &bl 1315 } 1316 1317 bl.Uids = codec.Decode(l.Pack, 0) 1318 bl.Postings = l.Postings 1319 bl.CommitTs = l.CommitTs 1320 bl.Splits = l.Splits 1321 return &bl 1322 } 1323 1324 // FromBackupPostingList converts a posting list in the format used for backups to a 1325 // normal posting list. 1326 func FromBackupPostingList(bl *pb.BackupPostingList) *pb.PostingList { 1327 l := pb.PostingList{} 1328 if bl == nil { 1329 return &l 1330 } 1331 1332 l.Pack = codec.Encode(bl.Uids, blockSize) 1333 l.Postings = bl.Postings 1334 l.CommitTs = bl.CommitTs 1335 l.Splits = bl.Splits 1336 return &l 1337 }