github.com/m3db/m3@v1.5.1-0.20231129193456-75a402aa583b/src/m3ninx/index/segment/fst/writer.go (about)

     1  // Copyright (c) 2018 Uber Technologies, Inc.
     2  //
     3  // Permission is hereby granted, free of charge, to any person obtaining a copy
     4  // of this software and associated documentation files (the "Software"), to deal
     5  // in the Software without restriction, including without limitation the rights
     6  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     7  // copies of the Software, and to permit persons to whom the Software is
     8  // furnished to do so, subject to the following conditions:
     9  //
    10  // The above copyright notice and this permission notice shall be included in
    11  // all copies or substantial portions of the Software.
    12  //
    13  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    14  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    15  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    16  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    17  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    18  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    19  // THE SOFTWARE.
    20  
    21  package fst
    22  
    23  import (
    24  	"fmt"
    25  	"io"
    26  
    27  	"github.com/m3db/m3/src/m3ninx/generated/proto/fswriter"
    28  	sgmt "github.com/m3db/m3/src/m3ninx/index/segment"
    29  	"github.com/m3db/m3/src/m3ninx/index/segment/fst/encoding"
    30  	"github.com/m3db/m3/src/m3ninx/postings"
    31  	"github.com/m3db/m3/src/m3ninx/postings/pilosa"
    32  	"github.com/m3db/m3/src/m3ninx/x"
    33  
    34  	"github.com/golang/protobuf/proto"
    35  )
    36  
// Default sizing values used when constructing a writer.
//
//   - defaultInitialPostingsOffsetsSize: initial capacity of the term-level and
//     field-level postings offsets slices.
//   - defaultInitialFSTTermsOffsetsSize: initial capacity of the per-field FST
//     terms offsets slice.
//   - defaultInitialIntEncoderSize: size passed to encoding.NewEncoder for the
//     integer encoder.
//
// NOTE(review): defaultInitialDocOffsetsSize, defaultInitialPostingsNeedsUnionSize
// and defaultPilosaRoaringMaxContainerSize are not referenced in this file;
// presumably they are used elsewhere in the package — confirm before removing.
var (
	defaultInitialPostingsOffsetsSize    = 1024
	defaultInitialFSTTermsOffsetsSize    = 1024
	defaultInitialDocOffsetsSize         = 1024
	defaultInitialPostingsNeedsUnionSize = 1024
	defaultInitialIntEncoderSize         = 128
	defaultPilosaRoaringMaxContainerSize = 128
)
    45  
// writer serializes the contents of a segment builder into the individual
// segment files: documents data, documents index, postings offsets, FST terms
// and FST fields. The *Written flags record which files have been produced so
// that files depending on earlier output can refuse to be written out of order.
type writer struct {
	// version is the FST format version this writer emits.
	version Version
	// builder is the source of documents, fields, terms and postings lists.
	// It is set by Reset and cleared by clear.
	builder sgmt.Builder
	// size is the number of documents in builder, captured by Reset and used
	// as the size hint when writing documents data.
	size    int64

	// intEncoder is a reusable encoder for the uint64 values written out
	// (sizes and the magic number).
	intEncoder      *encoding.Encoder
	// postingsEncoder is a reusable encoder for postings lists.
	postingsEncoder *pilosa.Encoder
	// fstWriter builds the per-field terms FSTs and the fields FST.
	fstWriter       *fstWriter
	// docsWriter writes the documents data and documents index files.
	docsWriter      *DocumentsWriter

	// metadata is the marshalled segment metadata produced by Reset.
	metadata            []byte
	// docsDataFileWritten is set once WriteDocumentsData has succeeded.
	docsDataFileWritten bool
	// postingsFileWritten is set once WritePostingsOffsets has succeeded.
	postingsFileWritten bool
	// fstTermsFileWritten is set once WriteFSTTerms has succeeded.
	fstTermsFileWritten bool
	// fstTermsOffsets holds, per field in iteration order, the running offset
	// recorded after that field's terms FST was written.
	fstTermsOffsets     []uint64
	// termPostingsOffsets holds, per (field, term) in iteration order, the
	// running offset recorded after that term's postings list was written.
	termPostingsOffsets []uint64

	// only used by versions >= 1.1
	// fieldPostingsOffsets holds, per field in iteration order, the running
	// offset recorded after that field's postings list was written.
	fieldPostingsOffsets []uint64
	// fieldData is the reusable per-field metadata message.
	fieldData            *fswriter.FieldData
	// fieldBuffer is the reusable buffer fieldData is marshalled into.
	fieldBuffer          proto.Buffer
}
    68  
// WriterOptions is a set of options used when writing an FST.
type WriterOptions struct {
	// DisableRegistry disables the FST builder node registry cache. The
	// registry de-duplicates transitions that are an exact match of each
	// other during a final compilation phase, which helps compress the FST by
	// a significant amount (e.g. 2x). Disabling it gives up that compression
	// in exchange for faster FST building, because the registry's high
	// fixed-cost lookups are skipped.
	DisableRegistry bool
}
    78  
    79  // NewWriter returns a new writer.
    80  func NewWriter(opts WriterOptions) (Writer, error) {
    81  	return newWriterWithVersion(opts, nil)
    82  }
    83  
    84  // newWriterWithVersion is a constructor used by tests to override version.
    85  func newWriterWithVersion(opts WriterOptions, vers *Version) (Writer, error) {
    86  	v := CurrentVersion
    87  	if vers != nil {
    88  		v = *vers
    89  	}
    90  	if err := v.Supported(); err != nil {
    91  		return nil, err
    92  	}
    93  
    94  	docsWriter, err := NewDocumentsWriter()
    95  	if err != nil {
    96  		return nil, err
    97  	}
    98  
    99  	return &writer{
   100  		version:             v,
   101  		intEncoder:          encoding.NewEncoder(defaultInitialIntEncoderSize),
   102  		postingsEncoder:     pilosa.NewEncoder(),
   103  		fstWriter:           newFSTWriter(opts),
   104  		docsWriter:          docsWriter,
   105  		fstTermsOffsets:     make([]uint64, 0, defaultInitialFSTTermsOffsetsSize),
   106  		termPostingsOffsets: make([]uint64, 0, defaultInitialPostingsOffsetsSize),
   107  
   108  		fieldPostingsOffsets: make([]uint64, 0, defaultInitialPostingsOffsetsSize),
   109  		fieldData:            &fswriter.FieldData{},
   110  	}, nil
   111  }
   112  
   113  func (w *writer) clear() {
   114  	w.builder = nil
   115  
   116  	w.fstWriter.Reset(nil)
   117  	w.intEncoder.Reset()
   118  	w.postingsEncoder.Reset()
   119  	w.docsWriter.Reset(DocumentsWriterOptions{})
   120  
   121  	w.metadata = nil
   122  	w.docsDataFileWritten = false
   123  	w.postingsFileWritten = false
   124  	w.fstTermsFileWritten = false
   125  	w.fstTermsOffsets = w.fstTermsOffsets[:0]
   126  	w.termPostingsOffsets = w.termPostingsOffsets[:0]
   127  
   128  	w.fieldPostingsOffsets = w.fieldPostingsOffsets[:0]
   129  	w.fieldData.Reset()
   130  	w.fieldBuffer.Reset()
   131  }
   132  
   133  func (w *writer) Reset(b sgmt.Builder) error {
   134  	w.clear()
   135  
   136  	if b == nil {
   137  		return nil
   138  	}
   139  
   140  	numDocs := len(b.Docs())
   141  	metadata := defaultV1Metadata()
   142  	metadata.NumDocs = int64(numDocs)
   143  	metadataBytes, err := metadata.Marshal()
   144  	if err != nil {
   145  		return err
   146  	}
   147  
   148  	w.metadata = metadataBytes
   149  	w.builder = b
   150  	w.size = int64(numDocs)
   151  	return nil
   152  }
   153  
   154  func (w *writer) MajorVersion() int {
   155  	return w.version.Major
   156  }
   157  
   158  func (w *writer) MinorVersion() int {
   159  	return w.version.Minor
   160  }
   161  
   162  func (w *writer) Metadata() []byte {
   163  	return w.metadata
   164  }
   165  
   166  func (w *writer) WriteDocumentsData(iow io.Writer) error {
   167  	iter, err := w.builder.AllDocs()
   168  	closer := x.NewSafeCloser(iter)
   169  	defer closer.Close()
   170  	if err != nil {
   171  		return err
   172  	}
   173  
   174  	w.docsWriter.Reset(DocumentsWriterOptions{
   175  		Iter:     iter,
   176  		SizeHint: int(w.size),
   177  	})
   178  	if err := w.docsWriter.WriteDocumentsData(iow); err != nil {
   179  		return err
   180  	}
   181  
   182  	w.docsDataFileWritten = true
   183  	return closer.Close()
   184  }
   185  
   186  func (w *writer) WriteDocumentsIndex(iow io.Writer) error {
   187  	if !w.docsDataFileWritten {
   188  		return fmt.Errorf("documents data file has to be written before documents index file")
   189  	}
   190  
   191  	return w.docsWriter.WriteDocumentsIndex(iow)
   192  }
   193  
   194  func (w *writer) WritePostingsOffsets(iow io.Writer) error {
   195  	var (
   196  		writeFieldsPostingList = w.version.supportsFieldPostingsList()
   197  		currentOffset          = uint64(0)
   198  	)
   199  	writePL := func(pl postings.List) (uint64, error) { // helper method
   200  		// serialize the postings list
   201  		w.postingsEncoder.Reset()
   202  		postingsBytes, err := w.postingsEncoder.Encode(pl)
   203  		if err != nil {
   204  			return 0, err
   205  		}
   206  		return w.writePayloadAndSizeAndMagicNumber(iow, postingsBytes)
   207  	}
   208  
   209  	// retrieve known fields
   210  	fields, err := w.builder.FieldsPostingsList()
   211  	if err != nil {
   212  		return err
   213  	}
   214  
   215  	// for each known field
   216  	for fields.Next() {
   217  		f, fieldPostingsList := fields.Current()
   218  		// retrieve known terms for current field
   219  		terms, err := w.builder.Terms(f)
   220  		if err != nil {
   221  			return err
   222  		}
   223  
   224  		// for each term corresponding to the current field
   225  		for terms.Next() {
   226  			_, pl := terms.Current()
   227  			// write the postings list
   228  			n, err := writePL(pl)
   229  			if err != nil {
   230  				return err
   231  			}
   232  			// update offset with the number of bytes we've written
   233  			currentOffset += n
   234  			// track current offset as the offset for the current field/term
   235  			w.termPostingsOffsets = append(w.termPostingsOffsets, currentOffset)
   236  		}
   237  
   238  		// write the field level postings list
   239  		if writeFieldsPostingList {
   240  			// Write the unioned postings list out.
   241  			n, err := writePL(fieldPostingsList)
   242  			if err != nil {
   243  				return err
   244  			}
   245  			// update offset with the number of bytes we've written
   246  			currentOffset += n
   247  			// track current offset as the offset for the current field
   248  			w.fieldPostingsOffsets = append(w.fieldPostingsOffsets, currentOffset)
   249  		}
   250  
   251  		if err := terms.Err(); err != nil {
   252  			return err
   253  		}
   254  
   255  		if err := terms.Close(); err != nil {
   256  			return err
   257  		}
   258  	}
   259  
   260  	if err := fields.Err(); err != nil {
   261  		return err
   262  	}
   263  
   264  	if err := fields.Close(); err != nil {
   265  		return err
   266  	}
   267  
   268  	w.postingsFileWritten = true
   269  	return nil
   270  }
   271  
   272  func (w *writer) WriteFSTTerms(iow io.Writer) error {
   273  	if !w.postingsFileWritten {
   274  		return fmt.Errorf("postings offsets have to be written before fst terms can be written")
   275  	}
   276  
   277  	var (
   278  		writeFieldsPostingList = w.version.supportsFieldPostingsList()
   279  		currentOffset          = uint64(0) // track offset of writes into `iow`.
   280  	)
   281  
   282  	// retrieve all known fields
   283  	fields, err := w.builder.FieldsPostingsList()
   284  	if err != nil {
   285  		return err
   286  	}
   287  
   288  	// iterate term|field postings offsets
   289  	var (
   290  		termOffsets  = w.termPostingsOffsets
   291  		fieldOffsets = w.fieldPostingsOffsets
   292  	)
   293  
   294  	// build a fst for each field's terms
   295  	for fields.Next() {
   296  		f, _ := fields.Current()
   297  
   298  		// write fields level postings list if required
   299  		if writeFieldsPostingList {
   300  			po := fieldOffsets[0]
   301  			fieldOffsets = fieldOffsets[1:]
   302  			md, err := w.fieldsMetadata(po)
   303  			if err != nil {
   304  				return err
   305  			}
   306  			if _, err := iow.Write(md); err != nil {
   307  				return err
   308  			}
   309  			numBytesMD := uint64(len(md))
   310  			numBytesMDSize, err := w.writeUint64(iow, numBytesMD)
   311  			if err != nil {
   312  				return err
   313  			}
   314  			currentOffset += numBytesMD + numBytesMDSize
   315  		}
   316  
   317  		// reset writer for this field's fst
   318  		if err := w.fstWriter.Reset(iow); err != nil {
   319  			return err
   320  		}
   321  
   322  		// retrieve all terms for this field
   323  		terms, err := w.builder.Terms(f)
   324  		if err != nil {
   325  			return err
   326  		}
   327  
   328  		// for each term corresponding to this field
   329  		for terms.Next() {
   330  			t, _ := terms.Current()
   331  
   332  			// retieve postsings offset for the current field,term
   333  			if len(termOffsets) == 0 {
   334  				return fmt.Errorf("postings offset not found for: field=%s, term=%s", f, t)
   335  			}
   336  
   337  			po := termOffsets[0]
   338  			termOffsets = termOffsets[1:]
   339  
   340  			// add the term -> posting offset into the term's fst
   341  			if err := w.fstWriter.Add(t, po); err != nil {
   342  				return err
   343  			}
   344  		}
   345  		if err := terms.Err(); err != nil {
   346  			return err
   347  		}
   348  
   349  		if err := terms.Close(); err != nil {
   350  			return err
   351  		}
   352  
   353  		// retrieve a serialized representation of the field's fst
   354  		numBytesFST, err := w.fstWriter.Close()
   355  		if err != nil {
   356  			return err
   357  		}
   358  
   359  		// serialize the size of the fst
   360  		n, err := w.writeSizeAndMagicNumber(iow, numBytesFST)
   361  		if err != nil {
   362  			return err
   363  		}
   364  		// update offset with the number of bytes we've written
   365  		currentOffset += numBytesFST + n
   366  
   367  		// track current offset as the offset for the current field's fst
   368  		w.fstTermsOffsets = append(w.fstTermsOffsets, currentOffset)
   369  	}
   370  
   371  	if err := fields.Err(); err != nil {
   372  		return err
   373  	}
   374  
   375  	if err := fields.Close(); err != nil {
   376  		return err
   377  	}
   378  
   379  	// make sure we consumed all the postings offsets
   380  	if len(termOffsets) != 0 {
   381  		return fmt.Errorf("term postings offsets remain at end of terms: remaining=%d",
   382  			len(termOffsets))
   383  	}
   384  
   385  	// make sure we consumed all the postings offsets
   386  	if len(fieldOffsets) != 0 {
   387  		return fmt.Errorf("field postings offsets remain at end of terms: remaining=%d",
   388  			len(fieldOffsets))
   389  	}
   390  
   391  	// all good!
   392  	w.fstTermsFileWritten = true
   393  	return nil
   394  }
   395  
   396  func (w *writer) fieldsMetadata(fieldPostingsOffset uint64) ([]byte, error) {
   397  	w.fieldBuffer.Reset()
   398  	w.fieldData.FieldPostingsListOffset = fieldPostingsOffset
   399  	if err := w.fieldBuffer.Marshal(w.fieldData); err != nil {
   400  		return nil, err
   401  	}
   402  	return w.fieldBuffer.Bytes(), nil
   403  }
   404  
   405  func (w *writer) WriteFSTFields(iow io.Writer) error {
   406  	if !w.fstTermsFileWritten {
   407  		return fmt.Errorf("fst terms files have to be written before fst fields can be written")
   408  	}
   409  
   410  	// reset fst writer
   411  	if err := w.fstWriter.Reset(iow); err != nil {
   412  		return err
   413  	}
   414  
   415  	// iterate field offsets
   416  	offsets := w.fstTermsOffsets
   417  
   418  	// retrieve all known fields
   419  	fields, err := w.builder.FieldsPostingsList()
   420  	if err != nil {
   421  		return err
   422  	}
   423  
   424  	// insert each field into fst
   425  	for fields.Next() {
   426  		f, _ := fields.Current()
   427  
   428  		// get offset for this field's term fst
   429  		if len(offsets) == 0 {
   430  			return fmt.Errorf("fst field offset not found for: field=%s", f)
   431  		}
   432  
   433  		offset := offsets[0]
   434  		offsets = offsets[1:]
   435  
   436  		// add field, offset into fst
   437  		if err := w.fstWriter.Add(f, offset); err != nil {
   438  			return err
   439  		}
   440  	}
   441  
   442  	if err := fields.Err(); err != nil {
   443  		return err
   444  	}
   445  
   446  	if err := fields.Close(); err != nil {
   447  		return err
   448  	}
   449  
   450  	// flush the fst writer
   451  	_, err = w.fstWriter.Close()
   452  
   453  	// make sure we consumed all the postings offsets
   454  	if len(offsets) != 0 {
   455  		return fmt.Errorf("field offsets remain at end of fields: remaining=%d",
   456  			len(offsets))
   457  	}
   458  
   459  	return err
   460  }
   461  
   462  // given a payload []byte, and io.Writer; this method writes the following data out to the writer
   463  // | payload - len(payload) bytes | 8 bytes for uint64 (size of payload) | 8 bytes for `magicNumber` |
   464  func (w *writer) writePayloadAndSizeAndMagicNumber(iow io.Writer, payload []byte) (uint64, error) {
   465  	numBytesWritten := uint64(0)
   466  	size, err := iow.Write(payload)
   467  	if err != nil {
   468  		return 0, err
   469  	}
   470  	numBytesWritten += uint64(size)
   471  	n, err := w.writeSizeAndMagicNumber(iow, uint64(size))
   472  	if err != nil {
   473  		return 0, err
   474  	}
   475  	numBytesWritten += n
   476  	return numBytesWritten, nil
   477  }
   478  
   479  func (w *writer) writeUint64(iow io.Writer, x uint64) (uint64, error) {
   480  	// serialize the size, magicNumber
   481  	w.intEncoder.Reset()
   482  	w.intEncoder.PutUint64(x)
   483  	xBytes := w.intEncoder.Bytes()
   484  
   485  	// write out the size
   486  	n, err := iow.Write(xBytes)
   487  	if err != nil {
   488  		return 0, err
   489  	}
   490  	return uint64(n), nil
   491  }
   492  
   493  func (w *writer) writeSizeAndMagicNumber(iow io.Writer, size uint64) (uint64, error) {
   494  	// serialize the size, magicNumber
   495  	w.intEncoder.Reset()
   496  	w.intEncoder.PutUint64(size)
   497  	w.intEncoder.PutUint64(uint64(magicNumber))
   498  	sizeBytes := w.intEncoder.Bytes()
   499  
   500  	// write out the size
   501  	n, err := iow.Write(sizeBytes)
   502  	if err != nil {
   503  		return 0, err
   504  	}
   505  	return uint64(n), nil
   506  }
   507  
   508  func defaultV1Metadata() fswriter.Metadata {
   509  	return fswriter.Metadata{
   510  		PostingsFormat: fswriter.PostingsFormat_PILOSAV1_POSTINGS_FORMAT,
   511  	}
   512  }
   513  
// docOffset pairs a postings ID with a byte offset.
// NOTE(review): not referenced elsewhere in this file; presumably the offset
// locates the document with that ID in the documents data file — confirm
// against its usages in the package.
type docOffset struct {
	postings.ID
	offset uint64
}