github.com/m3db/m3@v1.5.0/src/m3ninx/index/segment/builder/multi_segments_builder.go

// Copyright (c) 2019 Uber Technologies, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

package builder

import (
	"fmt"
	"io"
	"sort"

	"github.com/m3db/m3/src/m3ninx/doc"
	"github.com/m3db/m3/src/m3ninx/index"
	"github.com/m3db/m3/src/m3ninx/index/segment"
	"github.com/m3db/m3/src/m3ninx/postings"
	xerrors "github.com/m3db/m3/src/x/errors"
)

type builderFromSegments struct {
	docs           []doc.Metadata
	idSet          *IDsMap
	filter         segment.DocumentsFilter
	segments       []segmentMetadata
	termsIter      *termsIterFromSegments
	segmentsOffset postings.ID
}

type segmentMetadata struct {
	segment segment.Segment
	offset  postings.ID
	// negativeOffsets is a lookup of which document IDs are duplicates or
	// should be skipped, that is documents that are already contained in
	// other segments or that should not be included in the output segment
	// and hence should not be returned when looking up documents. In that
	// case the entry is -1. If a document ID is neither a duplicate nor
	// skipped, then the entry is the shift that should be applied when
	// translating this postings ID to the result postings ID.
	negativeOffsets []int64
	skips           int64
}
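// Illustrative sketch, not part of the original file: assuming the consumer
// resolves a result postings ID as offset + id - negativeOffsets[id] (which
// matches the bookkeeping in AddSegments below), a segment appended at
// offset 10 with negativeOffsets [0, 0, -1, 1, 1] translates as follows:
//
//	local ID 0 -> 10 + 0 - 0 = 10
//	local ID 1 -> 10 + 1 - 0 = 11
//	local ID 2 -> no result ID (duplicate or filtered out, entry is -1)
//	local ID 3 -> 10 + 3 - 1 = 12
//	local ID 4 -> 10 + 4 - 1 = 13
//
// That is, each surviving document is shifted down by the number of documents
// skipped before it in its segment, then shifted up by the segment's base
// offset into the merged document space.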
// NewBuilderFromSegments returns a new builder from segments.
func NewBuilderFromSegments(opts Options) segment.SegmentsBuilder {
	return &builderFromSegments{
		idSet: NewIDsMap(IDsMapOptions{
			InitialSize: opts.InitialCapacity(),
		}),
		termsIter: newTermsIterFromSegments(),
	}
}

func (b *builderFromSegments) Reset() {
	// Reset the documents slice.
	var emptyDoc doc.Metadata
	for i := range b.docs {
		b.docs[i] = emptyDoc
	}
	b.docs = b.docs[:0]

	// Reset all entries in the ID set.
	b.idSet.Reset()

	// Reset the segments metadata.
	b.segmentsOffset = 0
	var emptySegment segmentMetadata
	for i := range b.segments {
		// Save the offsets array.
		negativeOffsets := b.segments[i].negativeOffsets
		b.segments[i] = emptySegment
		b.segments[i].negativeOffsets = negativeOffsets[:0]
	}
	b.segments = b.segments[:0]

	b.termsIter.clear()
}

func (b *builderFromSegments) SetFilter(
	filter segment.DocumentsFilter,
) {
	b.filter = filter
}
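// Illustrative sketch, not part of the original file: a filter that keeps
// only documents whose ID carries a given prefix. This assumes that
// segment.DocumentsFilter consists of exactly the two methods this builder
// invokes in AddSegments below (ContainsDoc and OnDuplicateDoc); if the
// interface declares more methods, they would need implementing as well.
// The snippet also assumes a "bytes" import.
//
//	type prefixFilter struct{ prefix []byte }
//
//	func (f prefixFilter) ContainsDoc(d doc.Metadata) bool {
//		return bytes.HasPrefix(d.ID, f.prefix)
//	}
//
//	func (f prefixFilter) OnDuplicateDoc(d doc.Metadata) {
//		// No-op; a real filter might increment a metric here.
//	}
//
//	// builder.SetFilter(prefixFilter{prefix: []byte("host:")})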
func (b *builderFromSegments) AddSegments(segments []segment.Segment) error {
	// Order from largest to smallest so that the first segment is the
	// largest when iterating over term postings lists (which means it can
	// be directly copied into the merged postings list via a union rather
	// than needing to shift postings list IDs to account for duplicates).
	// Note: This must be done first so that offset is correctly zero
	// for the largest segment.
	sort.Slice(segments, func(i, j int) bool {
		return segments[i].Size() > segments[j].Size()
	})

	// numMaxDocs can sometimes be larger than the actual number of
	// documents since some are duplicates.
	numMaxDocs := 0
	for _, segment := range segments {
		numMaxDocs += int(segment.Size())
	}

	// Ensure we don't have to constantly reallocate the docs slice. If we
	// must grow it, copy any existing documents across rather than
	// dropping them.
	totalMaxSize := len(b.docs) + numMaxDocs
	if cap(b.docs) < totalMaxSize {
		docs := make([]doc.Metadata, 0, totalMaxSize)
		b.docs = append(docs, b.docs...)
	}

	// First build metadata and the docs slice.
	for _, segment := range segments {
		iter, closer, err := allDocsIter(segment)
		if err != nil {
			return err
		}

		var negativeOffsets []int64
		if n := len(b.segments); cap(b.segments) > n {
			// Take the offsets from the element we're about to reuse.
			negativeOffsets = b.segments[:n+1][n].negativeOffsets[:0]
		}
		if int64(cap(negativeOffsets)) < segment.Size() {
			negativeOffsets = make([]int64, 0, int(1.5*float64(segment.Size())))
		}

		var (
			added      int
			currOffset int64
		)
		for iter.Next() {
			d := iter.Current()
			negativeOffsets = append(negativeOffsets, currOffset)
			if b.idSet.Contains(d.ID) {
				// Skip duplicates.
				negativeOffsets[len(negativeOffsets)-1] = -1
				currOffset++
				if b.filter != nil {
					// Callback for when a duplicate doc is encountered and
					// we filter the document out of the resulting segment.
					b.filter.OnDuplicateDoc(d)
				}
				continue
			}
			if b.filter != nil && !b.filter.ContainsDoc(d) {
				// Actively filtering and the document is not contained.
				negativeOffsets[len(negativeOffsets)-1] = -1
				currOffset++
				continue
			}
			b.idSet.SetUnsafe(d.ID, struct{}{}, IDsMapSetUnsafeOptions{
				NoCopyKey:     true,
				NoFinalizeKey: true,
			})
			b.docs = append(b.docs, d)
			added++
		}

		err = xerrors.FirstError(iter.Err(), iter.Close(), closer.Close())
		if err != nil {
			return err
		}

		b.segments = append(b.segments, segmentMetadata{
			segment:         segment,
			offset:          b.segmentsOffset,
			negativeOffsets: negativeOffsets,
			skips:           currOffset,
		})
		b.segmentsOffset += postings.ID(added)
	}

	// Make sure the terms iter has all the segments to combine data from.
	b.termsIter.reset(b.segments)

	return nil
}

func (b *builderFromSegments) SegmentMetadatas() ([]segment.SegmentsBuilderSegmentMetadata, error) {
	n := len(b.segments)
	if n < 1 {
		return nil, fmt.Errorf("segments empty: length=%d", n)
	}

	result := make([]segment.SegmentsBuilderSegmentMetadata, 0, n)
	for _, s := range b.segments {
		result = append(result, segment.SegmentsBuilderSegmentMetadata{
			Segment:         s.segment,
			Offset:          s.offset,
			NegativeOffsets: s.negativeOffsets,
			Skips:           s.skips,
		})
	}

	return result, nil
}

func (b *builderFromSegments) Docs() []doc.Metadata {
	return b.docs
}

func (b *builderFromSegments) AllDocs() (index.IDDocIterator, error) {
	rangeIter := postings.NewRangeIterator(0, postings.ID(len(b.docs)))
	return index.NewIDDocIterator(b, rangeIter), nil
}

func (b *builderFromSegments) Metadata(id postings.ID) (doc.Metadata, error) {
	idx := int(id)
	if idx < 0 || idx >= len(b.docs) {
		return doc.Metadata{}, errDocNotFound
	}

	return b.docs[idx], nil
}

func (b *builderFromSegments) NumDocs() (int, error) {
	return len(b.docs), nil
}

func (b *builderFromSegments) FieldsIterable() segment.FieldsIterable {
	return b
}

func (b *builderFromSegments) TermsIterable() segment.TermsIterable {
	return b
}

func (b *builderFromSegments) Fields() (segment.FieldsIterator, error) {
	return newFieldIterFromSegments(b.segments)
}

func (b *builderFromSegments) FieldsPostingsList() (segment.FieldsPostingsListIterator, error) {
	return newFieldPostingsListIterFromSegments(b.segments)
}

func (b *builderFromSegments) Terms(field []byte) (segment.TermsIterator, error) {
	if err := b.termsIter.setField(field); err != nil {
		return nil, err
	}
	return b.termsIter, nil
}

func allDocsIter(seg segment.Segment) (index.IDDocIterator, io.Closer, error) {
	reader, err := seg.Reader()
	if err != nil {
		return nil, nil, err
	}

	iter, err := reader.AllDocs()
	if err != nil {
		return nil, nil, err
	}

	return iter, reader, nil
}
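// Illustrative usage sketch, not part of the original file. It assumes two
// already-built segments (seg1, seg2) and that NewOptions() is this package's
// Options constructor; the call order mirrors how the builder above is meant
// to be driven:
//
//	builder := NewBuilderFromSegments(NewOptions())
//	builder.Reset()
//	if err := builder.AddSegments([]segment.Segment{seg1, seg2}); err != nil {
//		return err
//	}
//	iter, err := builder.AllDocs() // deduplicated docs across both segments
//	if err != nil {
//		return err
//	}
//	for iter.Next() {
//		_ = iter.Current() // doc.Metadata, indexed by builder-local postings ID
//	}
//	if err := iter.Err(); err != nil {
//		return err
//	}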