github.com/m3db/m3@v1.5.0/src/m3ninx/index/segment/builder/multi_segments_builder.go (about)

     1  // Copyright (c) 2019 Uber Technologies, Inc.
     2  //
     3  // Permission is hereby granted, free of charge, to any person obtaining a copy
     4  // of this software and associated documentation files (the "Software"), to deal
     5  // in the Software without restriction, including without limitation the rights
     6  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     7  // copies of the Software, and to permit persons to whom the Software is
     8  // furnished to do so, subject to the following conditions:
     9  //
    10  // The above copyright notice and this permission notice shall be included in
    11  // all copies or substantial portions of the Software.
    12  //
    13  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    14  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    15  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    16  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    17  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    18  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    19  // THE SOFTWARE.
    20  
    21  package builder
    22  
    23  import (
    24  	"fmt"
    25  	"io"
    26  	"sort"
    27  
    28  	"github.com/m3db/m3/src/m3ninx/doc"
    29  	"github.com/m3db/m3/src/m3ninx/index"
    30  	"github.com/m3db/m3/src/m3ninx/index/segment"
    31  	"github.com/m3db/m3/src/m3ninx/postings"
    32  	xerrors "github.com/m3db/m3/src/x/errors"
    33  )
    34  
// builderFromSegments builds a single combined segment out of several
// existing segments, de-duplicating documents by ID and optionally
// filtering them via a segment.DocumentsFilter.
type builderFromSegments struct {
	// docs holds the unique, non-filtered documents across all added
	// segments, indexed by result postings ID.
	docs           []doc.Metadata
	// idSet tracks document IDs already added so duplicates across
	// segments can be skipped.
	idSet          *IDsMap
	// filter, when non-nil, decides which documents are kept and is
	// notified of duplicate documents.
	filter         segment.DocumentsFilter
	// segments holds per-added-segment bookkeeping (offset, skips,
	// negative offsets) used to translate postings IDs.
	segments       []segmentMetadata
	// termsIter is a reusable iterator combining term data from all
	// added segments.
	termsIter      *termsIterFromSegments
	// segmentsOffset is the running count of unique documents added so
	// far; it becomes the base offset of the next added segment.
	segmentsOffset postings.ID
}
    43  
// segmentMetadata tracks a single added segment together with the
// bookkeeping needed to translate its postings IDs into result
// postings IDs.
type segmentMetadata struct {
	// segment is the original segment this metadata describes.
	segment segment.Segment
	// offset is the base result postings ID for this segment's
	// documents (the count of unique docs added before it).
	offset  postings.ID
	// negativeOffsets is a lookup of document IDs are duplicates or should be skipped,
	// that is documents that are already contained by other segments or should
	// not be included in the output segment and hence should not be returned
	// when looking up documents. If this is the case offset is -1.
	// If a document ID is not a duplicate or skipped then the offset is
	// the shift that should be applied when translating this postings ID
	// to the result postings ID.
	negativeOffsets []int64
	// skips is the total number of documents skipped (duplicates or
	// filtered out) in this segment.
	skips           int64
}
    57  
    58  // NewBuilderFromSegments returns a new builder from segments.
    59  func NewBuilderFromSegments(opts Options) segment.SegmentsBuilder {
    60  	return &builderFromSegments{
    61  		idSet: NewIDsMap(IDsMapOptions{
    62  			InitialSize: opts.InitialCapacity(),
    63  		}),
    64  		termsIter: newTermsIterFromSegments(),
    65  	}
    66  }
    67  
    68  func (b *builderFromSegments) Reset() {
    69  	// Reset the documents slice
    70  	var emptyDoc doc.Metadata
    71  	for i := range b.docs {
    72  		b.docs[i] = emptyDoc
    73  	}
    74  	b.docs = b.docs[:0]
    75  
    76  	// Reset all entries in ID set
    77  	b.idSet.Reset()
    78  
    79  	// Reset the segments metadata
    80  	b.segmentsOffset = 0
    81  	var emptySegment segmentMetadata
    82  	for i := range b.segments {
    83  		// Save the offsets array.
    84  		negativeOffsets := b.segments[i].negativeOffsets
    85  		b.segments[i] = emptySegment
    86  		b.segments[i].negativeOffsets = negativeOffsets[:0]
    87  	}
    88  	b.segments = b.segments[:0]
    89  
    90  	b.termsIter.clear()
    91  }
    92  
// SetFilter sets the documents filter applied while adding segments:
// documents not contained by the filter are excluded from the built
// segment, and the filter is notified of duplicate documents.
func (b *builderFromSegments) SetFilter(
	filter segment.DocumentsFilter,
) {
	b.filter = filter
}
    98  
    99  func (b *builderFromSegments) AddSegments(segments []segment.Segment) error {
   100  	// Order by largest -> smallest so that the first segment
   101  	// is the largest when iterating over term postings lists
   102  	// (which means it can be directly copied into the merged postings
   103  	// list via a union rather than needing to shift posting list
   104  	// IDs to take into account for duplicates).
   105  	// Note: This must be done first so that offset is correctly zero
   106  	// for the largest segment.
   107  	sort.Slice(segments, func(i, j int) bool {
   108  		return segments[i].Size() > segments[j].Size()
   109  	})
   110  
   111  	// numMaxDocs can sometimes be larger than the actual number of documents
   112  	// since some are duplicates
   113  	numMaxDocs := 0
   114  	for _, segment := range segments {
   115  		numMaxDocs += int(segment.Size())
   116  	}
   117  
   118  	// Ensure we don't have to constantly reallocate docs slice
   119  	totalMaxSize := len(b.docs) + numMaxDocs
   120  	if cap(b.docs) < totalMaxSize {
   121  		b.docs = make([]doc.Metadata, 0, totalMaxSize)
   122  	}
   123  
   124  	// First build metadata and docs slice
   125  	for _, segment := range segments {
   126  		iter, closer, err := allDocsIter(segment)
   127  		if err != nil {
   128  			return err
   129  		}
   130  
   131  		var negativeOffsets []int64
   132  		if n := len(b.segments); cap(b.segments) > n {
   133  			// Take the offsets from the element we're about to reuse.
   134  			negativeOffsets = b.segments[:n+1][n].negativeOffsets[:0]
   135  		}
   136  		if int64(cap(negativeOffsets)) < segment.Size() {
   137  			negativeOffsets = make([]int64, 0, int(1.5*float64(segment.Size())))
   138  		}
   139  
   140  		var (
   141  			added      int
   142  			currOffset int64
   143  		)
   144  		for iter.Next() {
   145  			d := iter.Current()
   146  			negativeOffsets = append(negativeOffsets, currOffset)
   147  			if b.idSet.Contains(d.ID) {
   148  				// Skip duplicates.
   149  				negativeOffsets[len(negativeOffsets)-1] = -1
   150  				currOffset++
   151  				if b.filter != nil {
   152  					// Callback for when duplicate doc encountered and we filter
   153  					// out the document from the resulting segment.
   154  					b.filter.OnDuplicateDoc(d)
   155  				}
   156  				continue
   157  			}
   158  			if b.filter != nil && !b.filter.ContainsDoc(d) {
   159  				// Actively filtering and ID is not contained.
   160  				negativeOffsets[len(negativeOffsets)-1] = -1
   161  				currOffset++
   162  				continue
   163  			}
   164  			b.idSet.SetUnsafe(d.ID, struct{}{}, IDsMapSetUnsafeOptions{
   165  				NoCopyKey:     true,
   166  				NoFinalizeKey: true,
   167  			})
   168  			b.docs = append(b.docs, d)
   169  			added++
   170  		}
   171  
   172  		err = xerrors.FirstError(iter.Err(), iter.Close(), closer.Close())
   173  		if err != nil {
   174  			return err
   175  		}
   176  
   177  		b.segments = append(b.segments, segmentMetadata{
   178  			segment:         segment,
   179  			offset:          b.segmentsOffset,
   180  			negativeOffsets: negativeOffsets,
   181  			skips:           currOffset,
   182  		})
   183  		b.segmentsOffset += postings.ID(added)
   184  	}
   185  
   186  	// Make sure the terms iter has all the segments to combine data from
   187  	b.termsIter.reset(b.segments)
   188  
   189  	return nil
   190  }
   191  
   192  func (b *builderFromSegments) SegmentMetadatas() ([]segment.SegmentsBuilderSegmentMetadata, error) {
   193  	n := len(b.segments)
   194  	if n < 1 {
   195  		return nil, fmt.Errorf("segments empty: length=%d", n)
   196  	}
   197  
   198  	result := make([]segment.SegmentsBuilderSegmentMetadata, 0, n)
   199  	for _, s := range b.segments {
   200  		result = append(result, segment.SegmentsBuilderSegmentMetadata{
   201  			Segment:         s.segment,
   202  			Offset:          s.offset,
   203  			NegativeOffsets: s.negativeOffsets,
   204  			Skips:           s.skips,
   205  		})
   206  	}
   207  
   208  	return result, nil
   209  }
   210  
// Docs returns the de-duplicated documents accumulated from all added
// segments, indexed by result postings ID.
func (b *builderFromSegments) Docs() []doc.Metadata {
	return b.docs
}
   214  
   215  func (b *builderFromSegments) AllDocs() (index.IDDocIterator, error) {
   216  	rangeIter := postings.NewRangeIterator(0, postings.ID(len(b.docs)))
   217  	return index.NewIDDocIterator(b, rangeIter), nil
   218  }
   219  
   220  func (b *builderFromSegments) Metadata(id postings.ID) (doc.Metadata, error) {
   221  	idx := int(id)
   222  	if idx < 0 || idx >= len(b.docs) {
   223  		return doc.Metadata{}, errDocNotFound
   224  	}
   225  
   226  	return b.docs[idx], nil
   227  }
   228  
// NumDocs returns the number of unique documents added; the error is
// always nil (the signature satisfies the docs-retrieval interface).
func (b *builderFromSegments) NumDocs() (int, error) {
	return len(b.docs), nil
}
   232  
// FieldsIterable returns the builder itself, which can iterate the
// fields across all added segments.
func (b *builderFromSegments) FieldsIterable() segment.FieldsIterable {
	return b
}
   236  
// TermsIterable returns the builder itself, which can iterate the
// terms across all added segments.
func (b *builderFromSegments) TermsIterable() segment.TermsIterable {
	return b
}
   240  
// Fields returns an iterator over the union of fields across all added
// segments.
func (b *builderFromSegments) Fields() (segment.FieldsIterator, error) {
	return newFieldIterFromSegments(b.segments)
}
   244  
// FieldsPostingsList returns an iterator over fields and their postings
// lists combined across all added segments.
func (b *builderFromSegments) FieldsPostingsList() (segment.FieldsPostingsListIterator, error) {
	return newFieldPostingsListIterFromSegments(b.segments)
}
   248  
   249  func (b *builderFromSegments) Terms(field []byte) (segment.TermsIterator, error) {
   250  	if err := b.termsIter.setField(field); err != nil {
   251  		return nil, err
   252  	}
   253  	return b.termsIter, nil
   254  }
   255  
   256  func allDocsIter(seg segment.Segment) (index.IDDocIterator, io.Closer, error) {
   257  	reader, err := seg.Reader()
   258  	if err != nil {
   259  		return nil, nil, err
   260  	}
   261  
   262  	iter, err := reader.AllDocs()
   263  	if err != nil {
   264  		return nil, nil, err
   265  	}
   266  
   267  	return iter, reader, nil
   268  }