github.com/m3db/m3@v1.5.1-0.20231129193456-75a402aa583b/src/m3ninx/index/segment/fst/encoding/docs/data.go (about)

     1  // Copyright (c) 2018 Uber Technologies, Inc.
     2  //
     3  // Permission is hereby granted, free of charge, to any person obtaining a copy
     4  // of this software and associated documentation files (the "Software"), to deal
     5  // in the Software without restriction, including without limitation the rights
     6  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     7  // copies of the Software, and to permit persons to whom the Software is
     8  // furnished to do so, subject to the following conditions:
     9  //
    10  // The above copyright notice and this permission notice shall be included in
    11  // all copies or substantial portions of the Software.
    12  //
    13  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    14  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    15  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    16  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    17  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    18  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    19  // THE SOFTWARE.
    20  
    21  package docs
    22  
    23  import (
    24  	"errors"
    25  	"fmt"
    26  	"io"
    27  
    28  	"github.com/m3db/m3/src/m3ninx/doc"
    29  	"github.com/m3db/m3/src/m3ninx/index/segment/fst/encoding"
    30  )
    31  
    32  const initialDataEncoderLen = 1024
    33  
    34  // DataWriter writes the data file for documents.
    35  type DataWriter struct {
    36  	writer io.Writer
    37  	enc    *encoding.Encoder
    38  }
    39  
    40  // NewDataWriter returns a new DataWriter.
    41  func NewDataWriter(w io.Writer) *DataWriter {
    42  	return &DataWriter{
    43  		writer: w,
    44  		enc:    encoding.NewEncoder(initialDataEncoderLen),
    45  	}
    46  }
    47  
    48  func (w *DataWriter) Write(d doc.Metadata) (int, error) {
    49  	n := w.enc.PutBytes(d.ID)
    50  	n += w.enc.PutUvarint(uint64(len(d.Fields)))
    51  	for _, f := range d.Fields {
    52  		n += w.enc.PutBytes(f.Name)
    53  		n += w.enc.PutBytes(f.Value)
    54  	}
    55  
    56  	if err := w.write(); err != nil {
    57  		return 0, err
    58  	}
    59  
    60  	return n, nil
    61  }
    62  
    63  func (w *DataWriter) write() error {
    64  	b := w.enc.Bytes()
    65  	n, err := w.writer.Write(b)
    66  	if err != nil {
    67  		return err
    68  	}
    69  	if n < len(b) {
    70  		return io.ErrShortWrite
    71  	}
    72  	w.enc.Reset()
    73  	return nil
    74  }
    75  
    76  // Reset resets the DataWriter.
    77  func (w *DataWriter) Reset(wr io.Writer) {
    78  	w.writer = wr
    79  	w.enc.Reset()
    80  }
    81  
    82  // DataReader is a reader for the data file for documents.
    83  type DataReader struct {
    84  	data []byte
    85  }
    86  
    87  // NewDataReader returns a new DataReader.
    88  func NewDataReader(data []byte) *DataReader {
    89  	return &DataReader{
    90  		data: data,
    91  	}
    92  }
    93  
    94  func (r *DataReader) Read(offset uint64) (doc.Metadata, error) {
    95  	if offset >= uint64(len(r.data)) {
    96  		return doc.Metadata{}, fmt.Errorf("invalid offset: %v is past the end of the data file", offset)
    97  	}
    98  	dec := encoding.NewDecoder(r.data[int(offset):])
    99  	id, err := dec.Bytes()
   100  	if err != nil {
   101  		return doc.Metadata{}, err
   102  	}
   103  
   104  	x, err := dec.Uvarint()
   105  	if err != nil {
   106  		return doc.Metadata{}, err
   107  	}
   108  	n := int(x)
   109  
   110  	d := doc.Metadata{
   111  		ID:     id,
   112  		Fields: make([]doc.Field, n),
   113  	}
   114  
   115  	for i := 0; i < n; i++ {
   116  		name, err := dec.Bytes()
   117  		if err != nil {
   118  			return doc.Metadata{}, err
   119  		}
   120  		val, err := dec.Bytes()
   121  		if err != nil {
   122  			return doc.Metadata{}, err
   123  		}
   124  		d.Fields[i] = doc.Field{
   125  			Name:  name,
   126  			Value: val,
   127  		}
   128  	}
   129  
   130  	return d, nil
   131  }
   132  
   133  // EncodedDataReader is a reader for the data file for encoded document metadata.
   134  type EncodedDataReader struct {
   135  	data []byte
   136  }
   137  
   138  // NewEncodedDataReader returns a new EncodedDataReader.
   139  func NewEncodedDataReader(data []byte) *EncodedDataReader {
   140  	return &EncodedDataReader{
   141  		data: data,
   142  	}
   143  }
   144  
   145  // Read reads a doc.Encoded from a data stream starting at the specified offset.
   146  func (e *EncodedDataReader) Read(offset uint64) (doc.Encoded, error) {
   147  	if offset >= uint64(len(e.data)) {
   148  		return doc.Encoded{}, fmt.Errorf(
   149  			"invalid offset: %v is past the end of the data file", offset,
   150  		)
   151  	}
   152  
   153  	return doc.Encoded{
   154  		Bytes: e.data[int(offset):],
   155  	}, nil
   156  }
   157  
   158  // EncodedDocumentReader is a reader for reading documents from encoded metadata.
   159  type EncodedDocumentReader struct {
   160  	currFields []doc.Field
   161  }
   162  
   163  // NewEncodedDocumentReader returns a new EncodedDocumentReader.
   164  func NewEncodedDocumentReader() *EncodedDocumentReader {
   165  	return &EncodedDocumentReader{}
   166  }
   167  
   168  // Read reads a doc.Metadata from a doc.Encoded. Returned doc.Metadata should be
   169  // processed before calling Read again as the underlying array pointed to by the Fields
   170  // slice will be updated. This approach avoids allocating a new slice with a new backing
   171  // array for every document processed, unlike (*DataReader).Read
   172  func (r *EncodedDocumentReader) Read(encoded doc.Encoded) (doc.Metadata, error) {
   173  	for i := range r.currFields {
   174  		r.currFields[i] = doc.Field{}
   175  	}
   176  	r.currFields = r.currFields[:0]
   177  	id, buf, err := encoding.ReadBytes(encoded.Bytes)
   178  	if err != nil {
   179  		return doc.Metadata{}, err
   180  	}
   181  
   182  	x, buf, err := encoding.ReadUvarint(buf)
   183  	if err != nil {
   184  		return doc.Metadata{}, err
   185  	}
   186  	n := int(x)
   187  
   188  	var name, val []byte
   189  	for i := 0; i < n; i++ {
   190  		name, buf, err = encoding.ReadBytes(buf)
   191  		if err != nil {
   192  			return doc.Metadata{}, err
   193  		}
   194  		val, buf, err = encoding.ReadBytes(buf)
   195  		if err != nil {
   196  			return doc.Metadata{}, err
   197  		}
   198  		r.currFields = append(r.currFields, doc.Field{
   199  			Name:  name,
   200  			Value: val,
   201  		})
   202  	}
   203  
   204  	return doc.Metadata{
   205  		ID:     id,
   206  		Fields: r.currFields,
   207  	}, nil
   208  }
   209  
   210  // ReadEncodedDocumentID reads the document ID from the encoded document metadata.
   211  func ReadEncodedDocumentID(encoded doc.Encoded) ([]byte, error) {
   212  	id, _, err := encoding.ReadBytes(encoded.Bytes)
   213  	return id, err
   214  }
   215  
   216  // MetadataFromDocument retrieves a doc.Metadata from a doc.Document.
   217  func MetadataFromDocument(document doc.Document, reader *EncodedDocumentReader) (doc.Metadata, error) {
   218  	if d, ok := document.Metadata(); ok {
   219  		return d, nil
   220  	}
   221  
   222  	if e, ok := document.Encoded(); ok {
   223  		return reader.Read(e)
   224  	}
   225  
   226  	return doc.Metadata{}, errors.New("document does not contain metadata or encoded metadata")
   227  }
   228  
   229  // ReadIDFromDocument reads the document ID from the document.
   230  func ReadIDFromDocument(document doc.Document) ([]byte, error) {
   231  	if d, ok := document.Metadata(); ok {
   232  		return d.ID, nil
   233  	}
   234  
   235  	if e, ok := document.Encoded(); ok {
   236  		return ReadEncodedDocumentID(e)
   237  	}
   238  
   239  	return nil, errors.New("document does not contain metadata or encoded metadata")
   240  }