github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/lsmkv/segment_group.go

//                           _       _
// __      _____  __ ___   ___  __ _| |_ ___
// \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
//  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
//   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
//
//  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
//
//  CONTACT: hello@weaviate.io
//

package lsmkv

import (
	"context"
	"errors"
	"fmt"
	"io/fs"
	"os"
	"path/filepath"
	"strings"
	"sync"

	"github.com/sirupsen/logrus"
	"github.com/weaviate/weaviate/adapters/repos/db/roaringset"
	"github.com/weaviate/weaviate/entities/cyclemanager"
	"github.com/weaviate/weaviate/entities/lsmkv"
	"github.com/weaviate/weaviate/entities/storagestate"
)

type SegmentGroup struct {
	segments []*segment

	// Lock() for changing the currently active segments, RLock() for normal
	// operation
	maintenanceLock sync.RWMutex
	dir             string

	strategy string

	compactionCallbackCtrl cyclemanager.CycleCallbackCtrl

	logger logrus.FieldLogger

	// for backward-compatibility with states where the disk state for maps was
	// not guaranteed to be sorted yet
	mapRequiresSorting bool

	status     storagestate.Status
	statusLock sync.Mutex
	metrics    *Metrics

	// all "replace" buckets support counting through net additions, but not all
	// produce a meaningful count. Typically, the only count we're interested in
	// is that of the bucket that holds objects
	monitorCount bool

	mmapContents            bool
	keepTombstones          bool // see bucket for more details
	useBloomFilter          bool // see bucket for more details
	calcCountNetAdditions   bool // see bucket for more details
	compactLeftOverSegments bool // see bucket for more details
}

type sgConfig struct {
	dir                   string
	strategy              string
	mapRequiresSorting    bool
	monitorCount          bool
	mmapContents          bool
	keepTombstones        bool
	useBloomFilter        bool
	calcCountNetAdditions bool
	forceCompaction       bool
}
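
// Illustrative sketch: one way a caller might assemble an sgConfig and
// initialize a SegmentGroup. The concrete values are hypothetical; in
// practice the enclosing Bucket derives them from its own options and passes
// its shared logger, metrics, and compaction callback group.
//
//	cfg := sgConfig{
//		dir:             "/var/lib/weaviate/my-bucket", // hypothetical path
//		strategy:        StrategyReplace,
//		monitorCount:    true,
//		useBloomFilter:  true,
//		forceCompaction: false,
//	}
//	sg, err := newSegmentGroup(logger, metrics, compactionCallbacks, cfg)
//	if err != nil {
//		// the directory could not be read, recovery failed, or a segment
//		// could not be initialized
//	}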

func newSegmentGroup(logger logrus.FieldLogger, metrics *Metrics,
	compactionCallbacks cyclemanager.CycleCallbackGroup, cfg sgConfig,
) (*SegmentGroup, error) {
	list, err := os.ReadDir(cfg.dir)
	if err != nil {
		return nil, err
	}
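
	// os.ReadDir returns entries sorted by file name; segment files are named
	// segment-<id>.db with monotonically increasing ids, so (assuming ids of
	// equal width) the listing visits segments from oldest to newest, which
	// is the order the segments slice relies on.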

	sg := &SegmentGroup{
		segments:                make([]*segment, len(list)),
		dir:                     cfg.dir,
		logger:                  logger,
		metrics:                 metrics,
		monitorCount:            cfg.monitorCount,
		mapRequiresSorting:      cfg.mapRequiresSorting,
		strategy:                cfg.strategy,
		mmapContents:            cfg.mmapContents,
		keepTombstones:          cfg.keepTombstones,
		useBloomFilter:          cfg.useBloomFilter,
		calcCountNetAdditions:   cfg.calcCountNetAdditions,
		compactLeftOverSegments: cfg.forceCompaction,
	}

	segmentIndex := 0

	segmentsAlreadyRecoveredFromCompaction := make(map[string]struct{})

	// Note: it's important to process the compacted (.tmp) segments first
	// TODO: a single iteration may be possible
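	//
	// For every segment-<left>_<right>.db.tmp file left behind by an
	// interrupted compaction, three disk states are possible:
	//  1. both source segments still exist: the compaction never completed
	//     the switch, so the .tmp file is deleted and the sources are kept
	//  2. only the right source exists: the left one was already cleaned up,
	//     so the leftover right segment is dropped and the .tmp file is
	//     renamed into its place
	//  3. neither source exists: the .tmp file is simply renamed into the
	//     right segment's place
	// A left segment without its right sibling is an invalid state and aborts
	// startup.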

	for _, entry := range list {
		if filepath.Ext(entry.Name()) != ".tmp" {
			continue
		}

		potentialCompactedSegmentFileName := strings.TrimSuffix(entry.Name(), ".tmp")

		if filepath.Ext(potentialCompactedSegmentFileName) != ".db" {
			// some other kind of temporary file, ignore it at this point,
			// though it may eventually need to be cleaned up
			continue
		}

		jointSegments := segmentID(potentialCompactedSegmentFileName)
		jointSegmentsIDs := strings.Split(jointSegments, "_")

		if len(jointSegmentsIDs) != 2 {
			return nil, fmt.Errorf("invalid compacted segment file name %q", entry.Name())
		}

		leftSegmentFilename := fmt.Sprintf("segment-%s.db", jointSegmentsIDs[0])
		rightSegmentFilename := fmt.Sprintf("segment-%s.db", jointSegmentsIDs[1])

		leftSegmentPath := filepath.Join(sg.dir, leftSegmentFilename)
		rightSegmentPath := filepath.Join(sg.dir, rightSegmentFilename)

		leftSegmentFound, err := fileExists(leftSegmentPath)
		if err != nil {
			return nil, fmt.Errorf("check for presence of segment %s: %w", leftSegmentFilename, err)
		}

		rightSegmentFound, err := fileExists(rightSegmentPath)
		if err != nil {
			return nil, fmt.Errorf("check for presence of segment %s: %w", rightSegmentFilename, err)
		}

		if leftSegmentFound && rightSegmentFound {
			if err := os.Remove(filepath.Join(sg.dir, entry.Name())); err != nil {
				return nil, fmt.Errorf("delete partially compacted segment %q: %w", entry.Name(), err)
			}
			continue
		}

		if leftSegmentFound && !rightSegmentFound {
			return nil, fmt.Errorf("missing right segment %q", rightSegmentFilename)
		}

		if !leftSegmentFound && rightSegmentFound {
			rightSegment, err := newSegment(rightSegmentPath, logger,
				metrics, sg.makeExistsOnLower(segmentIndex),
				sg.mmapContents, sg.useBloomFilter, sg.calcCountNetAdditions, true)
			if err != nil {
				return nil, fmt.Errorf("init segment %s: %w", rightSegmentFilename, err)
			}

			err = rightSegment.drop()
			if err != nil {
				return nil, fmt.Errorf("delete already compacted right segment %s: %w", rightSegmentFilename, err)
			}
		}

		if err := os.Rename(filepath.Join(sg.dir, entry.Name()), rightSegmentPath); err != nil {
			return nil, fmt.Errorf("rename compacted segment file %q as %q: %w", entry.Name(), rightSegmentFilename, err)
		}

		segment, err := newSegment(rightSegmentPath, logger,
			metrics, sg.makeExistsOnLower(segmentIndex),
			sg.mmapContents, sg.useBloomFilter, sg.calcCountNetAdditions, true)
		if err != nil {
			return nil, fmt.Errorf("init segment %s: %w", rightSegmentFilename, err)
		}

		sg.segments[segmentIndex] = segment
		segmentIndex++

		segmentsAlreadyRecoveredFromCompaction[rightSegmentFilename] = struct{}{}
	}

	for _, entry := range list {
		if filepath.Ext(entry.Name()) != ".db" {
			// skip, this could be commit log, etc.
			continue
		}

		_, alreadyRecoveredFromCompaction := segmentsAlreadyRecoveredFromCompaction[entry.Name()]
		if alreadyRecoveredFromCompaction {
			// the .db file was already removed and restored from a compacted segment
			continue
		}

		// before we can mount this file, we need to check if a WAL exists for
		// it. If yes, we must assume that the flush never finished, as
		// otherwise the WAL would have been deleted. Thus we must remove the
		// segment and recover from the WAL instead.
		walFileName := strings.TrimSuffix(entry.Name(), ".db") + ".wal"
		ok, err := fileExists(filepath.Join(sg.dir, walFileName))
		if err != nil {
			return nil, fmt.Errorf("check for presence of wals for segment %s: %w",
				entry.Name(), err)
		}
		if ok {
			// the segment will be recovered from the WAL
			err := os.Remove(filepath.Join(sg.dir, entry.Name()))
			if err != nil {
				return nil, fmt.Errorf("delete partially written segment %s: %w", entry.Name(), err)
			}

			logger.WithField("action", "lsm_segment_init").
				WithField("path", filepath.Join(sg.dir, entry.Name())).
				WithField("wal_path", walFileName).
				Info("Discarded (partially written) LSM segment, because an active WAL for " +
					"the same segment was found. A recovery from the WAL will follow.")

			continue
		}

		segment, err := newSegment(filepath.Join(sg.dir, entry.Name()), logger,
			metrics, sg.makeExistsOnLower(segmentIndex),
			sg.mmapContents, sg.useBloomFilter, sg.calcCountNetAdditions, false)
		if err != nil {
			return nil, fmt.Errorf("init segment %s: %w", entry.Name(), err)
		}

		sg.segments[segmentIndex] = segment
		segmentIndex++
	}

	sg.segments = sg.segments[:segmentIndex]

	if sg.monitorCount {
		sg.metrics.ObjectCount(sg.count())
	}

	id := "segmentgroup/compaction/" + sg.dir
	sg.compactionCallbackCtrl = compactionCallbacks.Register(id, sg.compactIfLevelsMatch)

	return sg, nil
}

func (sg *SegmentGroup) makeExistsOnLower(nextSegmentIndex int) existsOnLowerSegmentsFn {
	return func(key []byte) (bool, error) {
		if nextSegmentIndex == 0 {
			// this is already the lowest possible segment, we can guarantee that
			// any key in this segment is previously unseen.
			return false, nil
		}

		v, err := sg.getWithUpperSegmentBoundary(key, nextSegmentIndex-1)
		if err != nil {
			return false, fmt.Errorf("check exists on segments lower than %d: %w",
				nextSegmentIndex, err)
		}

		return v != nil, nil
	}
}
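
// Worked example for the callback above, assuming the "replace" strategy: if
// key k was first written in segment 0 and is written again in segment 2, the
// callback reports k as existing on a lower segment, so segment 2's
// countNetAdditions does not count k as a new addition; a tombstone for a
// previously existing key likewise counts as a net removal.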

func (sg *SegmentGroup) add(path string) error {
	sg.maintenanceLock.Lock()
	defer sg.maintenanceLock.Unlock()

	newSegmentIndex := len(sg.segments)
	segment, err := newSegment(path, sg.logger,
		sg.metrics, sg.makeExistsOnLower(newSegmentIndex),
		sg.mmapContents, sg.useBloomFilter, sg.calcCountNetAdditions, true)
	if err != nil {
		return fmt.Errorf("init segment %s: %w", path, err)
	}

	sg.segments = append(sg.segments, segment)
	return nil
}

func (sg *SegmentGroup) get(key []byte) ([]byte, error) {
	sg.maintenanceLock.RLock()
	defer sg.maintenanceLock.RUnlock()

	return sg.getWithUpperSegmentBoundary(key, len(sg.segments)-1)
}

// not thread-safe on its own, as the assumption is that this is called from a
// lockholder, e.g. within .get()
func (sg *SegmentGroup) getWithUpperSegmentBoundary(key []byte, topMostSegment int) ([]byte, error) {
	// assumes "replace" strategy

	// start with the latest segment and exit as soon as something is found,
	// thus making sure the latest value takes precedence
	for i := topMostSegment; i >= 0; i-- {
		v, err := sg.segments[i].get(key)
		if err != nil {
			if errors.Is(err, lsmkv.NotFound) {
				continue
			}

			if errors.Is(err, lsmkv.Deleted) {
				return nil, nil
			}

			panic(fmt.Sprintf("unsupported error in segmentGroup.get(): %v", err))
		}

		return v, nil
	}

	return nil, nil
}
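
// Worked example for the precedence rule above: if segment 0 holds k=v1,
// segment 1 holds k=v2, and segment 2 holds a tombstone for k, then a get(k)
// inspects segment 2 first, observes lsmkv.Deleted, and returns nil without
// consulting the older segments.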

func (sg *SegmentGroup) getBySecondaryIntoMemory(pos int, key []byte, buffer []byte) ([]byte, []byte, error) {
	sg.maintenanceLock.RLock()
	defer sg.maintenanceLock.RUnlock()

	// assumes "replace" strategy

	// start with the latest segment and exit as soon as something is found,
	// thus making sure the latest value takes precedence
	for i := len(sg.segments) - 1; i >= 0; i-- {
		v, err, allocatedBuff := sg.segments[i].getBySecondaryIntoMemory(pos, key, buffer)
		if err != nil {
			if errors.Is(err, lsmkv.NotFound) {
				continue
			}

			if errors.Is(err, lsmkv.Deleted) {
				return nil, nil, nil
			}

			panic(fmt.Sprintf("unsupported error in segmentGroup.getBySecondaryIntoMemory(): %v", err))
		}

		return v, allocatedBuff, nil
	}

	return nil, nil, nil
}

func (sg *SegmentGroup) getCollection(key []byte) ([]value, error) {
	sg.maintenanceLock.RLock()
	defer sg.maintenanceLock.RUnlock()

	var out []value

	// start with the oldest segment and do not exit early, as with collection
	// strategies every segment may hold entries for this key
	for _, segment := range sg.segments {
		v, err := segment.getCollection(key)
		if err != nil {
			if errors.Is(err, lsmkv.NotFound) {
				continue
			}

			return nil, err
		}

		if len(out) == 0 {
			out = v
		} else {
			out = append(out, v...)
		}
	}

	return out, nil
}

func (sg *SegmentGroup) getCollectionBySegments(key []byte) ([][]value, error) {
	sg.maintenanceLock.RLock()
	defer sg.maintenanceLock.RUnlock()

	out := make([][]value, len(sg.segments))

	i := 0
	// start with the oldest segment and do not exit early, as every segment
	// may hold entries for this key
	for _, segment := range sg.segments {
		v, err := segment.getCollection(key)
		if err != nil {
			if errors.Is(err, lsmkv.NotFound) {
				continue
			}

			return nil, err
		}

		out[i] = v
		i++
	}

	return out[:i], nil
}

func (sg *SegmentGroup) roaringSetGet(key []byte) (roaringset.BitmapLayers, error) {
	sg.maintenanceLock.RLock()
	defer sg.maintenanceLock.RUnlock()

	var out roaringset.BitmapLayers

	// start with the oldest segment and do not exit early, as every segment
	// may hold a layer for this key
	for _, segment := range sg.segments {
		rs, err := segment.roaringSetGet(key)
		if err != nil {
			if errors.Is(err, lsmkv.NotFound) {
				continue
			}

			return nil, err
		}

		out = append(out, rs)
	}

	return out, nil
}

func (sg *SegmentGroup) count() int {
	sg.maintenanceLock.RLock()
	defer sg.maintenanceLock.RUnlock()

	count := 0
	for _, seg := range sg.segments {
		count += seg.countNetAdditions
	}

	return count
}
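
// Note: count only yields a meaningful value for buckets that calculate net
// additions (see calcCountNetAdditions above); for other buckets the
// per-segment countNetAdditions values stay zero, which is presumably why
// monitorCount is typically enabled only for the bucket holding objects.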

func (sg *SegmentGroup) shutdown(ctx context.Context) error {
	if err := sg.compactionCallbackCtrl.Unregister(ctx); err != nil {
		return fmt.Errorf("long-running compaction in progress: %w", err)
	}

	// The lock is acquired only after the compaction cycle has been asked to
	// stop, to avoid an occasional deadlock: the compaction logic running in
	// the cycle also requires the maintenance lock.
	//
	// If shutdown grabbed the lock first and a compaction started right after,
	// the compaction would block on the same lock, stalling the entire cycle
	// loop and preventing it from reading the stop signal. Shutdown would then
	// never receive the stop result, never proceed with further execution, and
	// the maintenance lock would never be released.
	sg.maintenanceLock.Lock()
	defer sg.maintenanceLock.Unlock()

	for i, seg := range sg.segments {
		if err := seg.close(); err != nil {
			return err
		}

		sg.segments[i] = nil
	}

	// make sure the segment list itself is set to nil. In case a memtable
	// still flushes after closing, it might otherwise try to read from the
	// disk segment list and run into nil-pointer problems.
	sg.segments = nil

	return nil
}

func (sg *SegmentGroup) UpdateStatus(status storagestate.Status) {
	sg.statusLock.Lock()
	defer sg.statusLock.Unlock()

	sg.status = status
}

func (sg *SegmentGroup) isReadyOnly() bool {
	sg.statusLock.Lock()
	defer sg.statusLock.Unlock()

	return sg.status == storagestate.StatusReadOnly
}

func fileExists(path string) (bool, error) {
	_, err := os.Stat(path)
	if err == nil {
		return true, nil
	}

	if errors.Is(err, fs.ErrNotExist) {
		return false, nil
	}

	return false, err
}