github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/inverted/delta_merger.go (about) 1 // _ _ 2 // __ _____ __ ___ ___ __ _| |_ ___ 3 // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 // \ V V / __/ (_| |\ V /| | (_| | || __/ 5 // \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 // 7 // Copyright © 2016 - 2024 Weaviate B.V. All rights reserved. 8 // 9 // CONTACT: hello@weaviate.io 10 // 11 12 package inverted 13 14 // DeltaMerger can be used to condense the number of single writes into one big 15 // one. Additionally it removes overlaps between additions and deletions. It is 16 // meant to be used in batch situation, where 5 ref objects in a row might each 17 // increase the doc count by one. Instead of writing 5 additions and 4 18 // deletions, this can be condensed to write just one addition 19 type DeltaMerger struct { 20 additions propsByName 21 deletions propsByName 22 } 23 24 func NewDeltaMerger() *DeltaMerger { 25 return &DeltaMerger{ 26 additions: propsByName{}, 27 deletions: propsByName{}, 28 } 29 } 30 31 func (dm *DeltaMerger) AddAdditions(props []Property, docID uint64) { 32 for _, prop := range props { 33 storedProp := dm.additions.getOrCreate(prop.Name) 34 storedProp.hasFilterableIndex = prop.HasFilterableIndex 35 storedProp.hasSearchableIndex = prop.HasSearchableIndex 36 for _, item := range prop.Items { 37 storedItem := storedProp.getOrCreateItem(item.Data) 38 storedItem.addDocIDAndFrequency(docID, item.TermFrequency) 39 } 40 } 41 } 42 43 func (dm *DeltaMerger) AddDeletions(props []Property, docID uint64) { 44 for _, prop := range props { 45 additionProp := dm.additions.getOrCreate(prop.Name) 46 for _, item := range prop.Items { 47 additionItem := additionProp.getOrCreateItem(item.Data) 48 ok := additionItem.deleteIfPresent(docID) 49 if ok { 50 // we are done with this prop, no need to register an explicit deletion 51 continue 52 } 53 54 // this was not added by us, we need to remove it 55 deletionProp := dm.deletions.getOrCreate(prop.Name) 56 deletionProp.hasFilterableIndex = prop.HasFilterableIndex 57 deletionProp.hasSearchableIndex = prop.HasSearchableIndex 58 deletionItem := deletionProp.getOrCreateItem(item.Data) 59 deletionItem.addDocIDAndFrequency(docID, 0) // frequency does not matter on deletion 60 } 61 } 62 } 63 64 func (dm *DeltaMerger) Merge() DeltaMergeResult { 65 return DeltaMergeResult{ 66 Additions: dm.additions.merge(), 67 Deletions: dm.deletions.merge(), 68 } 69 } 70 71 type DeltaMergeResult struct { 72 Additions []MergeProperty 73 Deletions []MergeProperty 74 } 75 76 type MergeProperty struct { 77 Name string 78 MergeItems []MergeItem 79 HasFilterableIndex bool 80 HasSearchableIndex bool 81 } 82 83 type MergeItem struct { 84 Data []byte 85 DocIDs []MergeDocIDWithFrequency 86 } 87 88 // IDs is meant for cases such as deletion, where the frequency is irrelevant, 89 // but the expected format is a []docID 90 func (mi MergeItem) IDs() []uint64 { 91 out := make([]uint64, len(mi.DocIDs)) 92 for i, tuple := range mi.DocIDs { 93 out[i] = tuple.DocID 94 } 95 96 return out 97 } 98 99 // Countable converts the merge item to a regular (non-merge) Countable. Note 100 // that this loses the IDs and Frequency information, so IDs have to be passed 101 // separately using .IDs() 102 func (mi MergeItem) Countable() Countable { 103 return Countable{ 104 Data: mi.Data, 105 } 106 } 107 108 type MergeDocIDWithFrequency struct { 109 DocID uint64 110 Frequency float32 111 } 112 113 type propsByName map[string]*propWithDocIDs 114 115 func (pbn propsByName) getOrCreate(name string) *propWithDocIDs { 116 prop, ok := pbn[name] 117 if ok { 118 return prop 119 } 120 prop = &propWithDocIDs{name: name, items: map[string]*countableWithDocIDs{}} 121 pbn[name] = prop 122 return prop 123 } 124 125 func (pbn propsByName) merge() []MergeProperty { 126 out := make([]MergeProperty, len(pbn)) 127 i := 0 128 for _, prop := range pbn { 129 mergedProp := prop.merge() 130 if mergedProp == nil { 131 continue 132 } 133 out[i] = *mergedProp 134 i++ 135 } 136 137 if i == 0 { 138 return nil 139 } 140 141 return out[:i] 142 } 143 144 type propWithDocIDs struct { 145 name string 146 items map[string]*countableWithDocIDs 147 hasFilterableIndex bool 148 hasSearchableIndex bool 149 } 150 151 func (pwd *propWithDocIDs) getOrCreateItem(data []byte) *countableWithDocIDs { 152 name := string(data) 153 item, ok := pwd.items[name] 154 if ok { 155 return item 156 } 157 item = &countableWithDocIDs{ 158 value: data, 159 docIDs: map[uint64]float32{}, 160 } 161 pwd.items[name] = item 162 return item 163 } 164 165 func (pwd *propWithDocIDs) merge() *MergeProperty { 166 items := make([]MergeItem, len(pwd.items)) 167 168 i := 0 169 for _, item := range pwd.items { 170 mergedItem := item.merge() 171 if mergedItem == nil { 172 continue 173 } 174 175 items[i] = *mergedItem 176 i++ 177 } 178 179 if i == 0 { 180 return nil 181 } 182 183 return &MergeProperty{ 184 Name: pwd.name, 185 MergeItems: items[:i], 186 HasFilterableIndex: pwd.hasFilterableIndex, 187 HasSearchableIndex: pwd.hasSearchableIndex, 188 } 189 } 190 191 type countableWithDocIDs struct { 192 value []byte 193 docIDs map[uint64]float32 // map[docid]frequency 194 } 195 196 func (cwd *countableWithDocIDs) addDocIDAndFrequency(docID uint64, freq float32) { 197 cwd.docIDs[docID] = freq 198 } 199 200 func (cwd *countableWithDocIDs) deleteIfPresent(docID uint64) bool { 201 _, ok := cwd.docIDs[docID] 202 if !ok { 203 return false 204 } 205 206 delete(cwd.docIDs, docID) 207 return true 208 } 209 210 func (cwd *countableWithDocIDs) merge() *MergeItem { 211 if len(cwd.docIDs) == 0 { 212 return nil 213 } 214 215 ids := make([]MergeDocIDWithFrequency, len(cwd.docIDs)) 216 i := 0 217 for docID, freq := range cwd.docIDs { 218 ids[i] = MergeDocIDWithFrequency{DocID: docID, Frequency: freq} 219 i++ 220 } 221 222 return &MergeItem{ 223 Data: cwd.value, 224 DocIDs: ids, 225 } 226 }