github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/lsmkv/segment_precompute_for_compaction.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  package lsmkv
    13  
    14  import (
    15  	"bytes"
    16  	"fmt"
    17  	"os"
    18  	"strings"
    19  
    20  	"github.com/edsrzf/mmap-go"
    21  	"github.com/pkg/errors"
    22  	"github.com/sirupsen/logrus"
    23  	"github.com/weaviate/weaviate/adapters/repos/db/lsmkv/segmentindex"
    24  )
    25  
    26  // preComputeSegmentMeta has no side-effects for an already running store. As a
    27  // result this can be run without the need to obtain any locks. All files
    28  // created will have a .tmp suffix so they don't interfere with existing
    29  // segments that might have a similar name.
    30  func preComputeSegmentMeta(path string, updatedCountNetAdditions int,
    31  	logger logrus.FieldLogger, useBloomFilter bool, calcCountNetAdditions bool,
    32  ) ([]string, error) {
    33  	out := []string{path}
    34  
    35  	// as a guardrail validate that the segment is considered a .tmp segment.
    36  	// This way we can be sure that we're not accidentally operating on a live
    37  	// segment as the segment group completely ignores .tmp segment files
    38  	if !strings.HasSuffix(path, ".tmp") {
    39  		return nil, fmt.Errorf("pre computing a segment expects a .tmp segment path")
    40  	}
    41  
    42  	file, err := os.Open(path)
    43  	if err != nil {
    44  		return nil, fmt.Errorf("open file: %w", err)
    45  	}
    46  	defer file.Close()
    47  
    48  	fileInfo, err := file.Stat()
    49  	if err != nil {
    50  		return nil, fmt.Errorf("stat file: %w", err)
    51  	}
    52  
    53  	contents, err := mmap.MapRegion(file, int(fileInfo.Size()), mmap.RDONLY, 0, 0)
    54  	if err != nil {
    55  		return nil, fmt.Errorf("mmap file: %w", err)
    56  	}
    57  
    58  	defer contents.Unmap()
    59  
    60  	header, err := segmentindex.ParseHeader(bytes.NewReader(contents[:segmentindex.HeaderSize]))
    61  	if err != nil {
    62  		return nil, fmt.Errorf("parse header: %w", err)
    63  	}
    64  
    65  	switch header.Strategy {
    66  	case segmentindex.StrategyReplace, segmentindex.StrategySetCollection,
    67  		segmentindex.StrategyMapCollection, segmentindex.StrategyRoaringSet:
    68  	default:
    69  		return nil, fmt.Errorf("unsupported strategy in segment")
    70  	}
    71  
    72  	primaryIndex, err := header.PrimaryIndex(contents)
    73  	if err != nil {
    74  		return nil, fmt.Errorf("extract primary index position: %w", err)
    75  	}
    76  
    77  	primaryDiskIndex := segmentindex.NewDiskTree(primaryIndex)
    78  
    79  	seg := &segment{
    80  		level: header.Level,
    81  		// trim the .tmp suffix to make sure the naming rules for the files we
    82  		// pre-compute later on still apply they will in turn be suffixed with
    83  		// .tmp, but that is supposed to be the end of the file. if we didn't trim
    84  		// the path here, we would end up with filenames like
    85  		// segment.tmp.bloom.tmp, whereas we want to end up with segment.bloom.tmp
    86  		path:                  strings.TrimSuffix(path, ".tmp"),
    87  		contents:              contents,
    88  		contentFile:           file,
    89  		version:               header.Version,
    90  		secondaryIndexCount:   header.SecondaryIndices,
    91  		segmentStartPos:       header.IndexStart,
    92  		segmentEndPos:         uint64(fileInfo.Size()),
    93  		strategy:              header.Strategy,
    94  		dataStartPos:          segmentindex.HeaderSize, // fixed value that's the same for all strategies
    95  		dataEndPos:            header.IndexStart,
    96  		index:                 primaryDiskIndex,
    97  		logger:                logger,
    98  		useBloomFilter:        useBloomFilter,
    99  		calcCountNetAdditions: calcCountNetAdditions,
   100  	}
   101  
   102  	if seg.secondaryIndexCount > 0 {
   103  		seg.secondaryIndices = make([]diskIndex, seg.secondaryIndexCount)
   104  		for i := range seg.secondaryIndices {
   105  			secondary, err := header.SecondaryIndex(contents, uint16(i))
   106  			if err != nil {
   107  				return nil, errors.Wrapf(err, "get position for secondary index at %d", i)
   108  			}
   109  			seg.secondaryIndices[i] = segmentindex.NewDiskTree(secondary)
   110  		}
   111  	}
   112  
   113  	if seg.useBloomFilter {
   114  		files, err := seg.precomputeBloomFilters()
   115  		if err != nil {
   116  			return nil, err
   117  		}
   118  		out = append(out, files...)
   119  	}
   120  	if seg.calcCountNetAdditions {
   121  		files, err := seg.precomputeCountNetAdditions(updatedCountNetAdditions)
   122  		if err != nil {
   123  			return nil, err
   124  		}
   125  		out = append(out, files...)
   126  	}
   127  
   128  	return out, nil
   129  }