github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/inverted/delta_merger.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  package inverted
    13  
    14  // DeltaMerger can be used to condense the number of single writes into one big
    15  // one. Additionally it removes overlaps between additions and deletions. It is
    16  // meant to be used in batch situation, where 5 ref objects in a row might each
    17  // increase the doc count by one. Instead of writing 5 additions and 4
    18  // deletions, this can be condensed to write just one addition
    19  type DeltaMerger struct {
    20  	additions propsByName
    21  	deletions propsByName
    22  }
    23  
    24  func NewDeltaMerger() *DeltaMerger {
    25  	return &DeltaMerger{
    26  		additions: propsByName{},
    27  		deletions: propsByName{},
    28  	}
    29  }
    30  
    31  func (dm *DeltaMerger) AddAdditions(props []Property, docID uint64) {
    32  	for _, prop := range props {
    33  		storedProp := dm.additions.getOrCreate(prop.Name)
    34  		storedProp.hasFilterableIndex = prop.HasFilterableIndex
    35  		storedProp.hasSearchableIndex = prop.HasSearchableIndex
    36  		for _, item := range prop.Items {
    37  			storedItem := storedProp.getOrCreateItem(item.Data)
    38  			storedItem.addDocIDAndFrequency(docID, item.TermFrequency)
    39  		}
    40  	}
    41  }
    42  
    43  func (dm *DeltaMerger) AddDeletions(props []Property, docID uint64) {
    44  	for _, prop := range props {
    45  		additionProp := dm.additions.getOrCreate(prop.Name)
    46  		for _, item := range prop.Items {
    47  			additionItem := additionProp.getOrCreateItem(item.Data)
    48  			ok := additionItem.deleteIfPresent(docID)
    49  			if ok {
    50  				// we are done with this prop, no need to register an explicit deletion
    51  				continue
    52  			}
    53  
    54  			// this was not added by us, we need to remove it
    55  			deletionProp := dm.deletions.getOrCreate(prop.Name)
    56  			deletionProp.hasFilterableIndex = prop.HasFilterableIndex
    57  			deletionProp.hasSearchableIndex = prop.HasSearchableIndex
    58  			deletionItem := deletionProp.getOrCreateItem(item.Data)
    59  			deletionItem.addDocIDAndFrequency(docID, 0) // frequency does not matter on deletion
    60  		}
    61  	}
    62  }
    63  
    64  func (dm *DeltaMerger) Merge() DeltaMergeResult {
    65  	return DeltaMergeResult{
    66  		Additions: dm.additions.merge(),
    67  		Deletions: dm.deletions.merge(),
    68  	}
    69  }
    70  
    71  type DeltaMergeResult struct {
    72  	Additions []MergeProperty
    73  	Deletions []MergeProperty
    74  }
    75  
    76  type MergeProperty struct {
    77  	Name               string
    78  	MergeItems         []MergeItem
    79  	HasFilterableIndex bool
    80  	HasSearchableIndex bool
    81  }
    82  
// MergeItem is a single merged value of a property together with every doc ID
// (and the frequency stored with it) that was recorded for that value.
type MergeItem struct {
	Data   []byte
	DocIDs []MergeDocIDWithFrequency
}
    87  
    88  // IDs is meant for cases such as deletion, where the frequency is irrelevant,
    89  // but the expected format is a []docID
    90  func (mi MergeItem) IDs() []uint64 {
    91  	out := make([]uint64, len(mi.DocIDs))
    92  	for i, tuple := range mi.DocIDs {
    93  		out[i] = tuple.DocID
    94  	}
    95  
    96  	return out
    97  }
    98  
    99  // Countable converts the merge item to a regular (non-merge) Countable. Note
   100  // that this loses the IDs and Frequency information, so IDs have to be passed
   101  // separately using .IDs()
   102  func (mi MergeItem) Countable() Countable {
   103  	return Countable{
   104  		Data: mi.Data,
   105  	}
   106  }
   107  
// MergeDocIDWithFrequency pairs a doc ID with the frequency stored for it.
// Entries that stem from deletions are registered with a frequency of 0, as
// the frequency does not matter there.
type MergeDocIDWithFrequency struct {
	DocID     uint64
	Frequency float32
}
   112  
// propsByName holds the per-property bookkeeping, keyed by property name.
type propsByName map[string]*propWithDocIDs
   114  
   115  func (pbn propsByName) getOrCreate(name string) *propWithDocIDs {
   116  	prop, ok := pbn[name]
   117  	if ok {
   118  		return prop
   119  	}
   120  	prop = &propWithDocIDs{name: name, items: map[string]*countableWithDocIDs{}}
   121  	pbn[name] = prop
   122  	return prop
   123  }
   124  
   125  func (pbn propsByName) merge() []MergeProperty {
   126  	out := make([]MergeProperty, len(pbn))
   127  	i := 0
   128  	for _, prop := range pbn {
   129  		mergedProp := prop.merge()
   130  		if mergedProp == nil {
   131  			continue
   132  		}
   133  		out[i] = *mergedProp
   134  		i++
   135  	}
   136  
   137  	if i == 0 {
   138  		return nil
   139  	}
   140  
   141  	return out[:i]
   142  }
   143  
   144  type propWithDocIDs struct {
   145  	name               string
   146  	items              map[string]*countableWithDocIDs
   147  	hasFilterableIndex bool
   148  	hasSearchableIndex bool
   149  }
   150  
   151  func (pwd *propWithDocIDs) getOrCreateItem(data []byte) *countableWithDocIDs {
   152  	name := string(data)
   153  	item, ok := pwd.items[name]
   154  	if ok {
   155  		return item
   156  	}
   157  	item = &countableWithDocIDs{
   158  		value:  data,
   159  		docIDs: map[uint64]float32{},
   160  	}
   161  	pwd.items[name] = item
   162  	return item
   163  }
   164  
   165  func (pwd *propWithDocIDs) merge() *MergeProperty {
   166  	items := make([]MergeItem, len(pwd.items))
   167  
   168  	i := 0
   169  	for _, item := range pwd.items {
   170  		mergedItem := item.merge()
   171  		if mergedItem == nil {
   172  			continue
   173  		}
   174  
   175  		items[i] = *mergedItem
   176  		i++
   177  	}
   178  
   179  	if i == 0 {
   180  		return nil
   181  	}
   182  
   183  	return &MergeProperty{
   184  		Name:               pwd.name,
   185  		MergeItems:         items[:i],
   186  		HasFilterableIndex: pwd.hasFilterableIndex,
   187  		HasSearchableIndex: pwd.hasSearchableIndex,
   188  	}
   189  }
   190  
// countableWithDocIDs tracks, for a single value, the doc IDs recorded for it
// along with the frequency stored for each doc ID.
type countableWithDocIDs struct {
	value  []byte
	docIDs map[uint64]float32 // map[docid]frequency
}
   195  
// addDocIDAndFrequency records freq for docID, overwriting any frequency
// previously stored for the same docID.
func (cwd *countableWithDocIDs) addDocIDAndFrequency(docID uint64, freq float32) {
	cwd.docIDs[docID] = freq
}
   199  
   200  func (cwd *countableWithDocIDs) deleteIfPresent(docID uint64) bool {
   201  	_, ok := cwd.docIDs[docID]
   202  	if !ok {
   203  		return false
   204  	}
   205  
   206  	delete(cwd.docIDs, docID)
   207  	return true
   208  }
   209  
   210  func (cwd *countableWithDocIDs) merge() *MergeItem {
   211  	if len(cwd.docIDs) == 0 {
   212  		return nil
   213  	}
   214  
   215  	ids := make([]MergeDocIDWithFrequency, len(cwd.docIDs))
   216  	i := 0
   217  	for docID, freq := range cwd.docIDs {
   218  		ids[i] = MergeDocIDWithFrequency{DocID: docID, Frequency: freq}
   219  		i++
   220  	}
   221  
   222  	return &MergeItem{
   223  		Data:   cwd.value,
   224  		DocIDs: ids,
   225  	}
   226  }