github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/lsmkv/bucket_recover_from_wal.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  package lsmkv
    13  
    14  import (
    15  	"bufio"
    16  	"context"
    17  	"os"
    18  	"path/filepath"
    19  	"strings"
    20  	"time"
    21  
    22  	"github.com/pkg/errors"
    23  	"github.com/weaviate/weaviate/entities/diskio"
    24  )
    25  
    26  func (b *Bucket) mayRecoverFromCommitLogs(ctx context.Context) error {
    27  	beforeAll := time.Now()
    28  	defer b.metrics.TrackStartupBucketRecovery(beforeAll)
    29  
    30  	// the context is only ever checked once at the beginning, as there is no
    31  	// point in aborting an ongoing recovery. It makes more sense to let it
    32  	// complete and have the next recovery (this is called once per bucket) run
    33  	// into this error. This way in a crashloop we'd eventually recover each
    34  	// bucket until there is nothing left to recover and startup could complete
    35  	// in time
    36  	if err := ctx.Err(); err != nil {
    37  		return errors.Wrap(err, "recover commit log")
    38  	}
    39  
    40  	list, err := os.ReadDir(b.dir)
    41  	if err != nil {
    42  		return err
    43  	}
    44  
    45  	var walFileNames []string
    46  	for _, fileInfo := range list {
    47  		if filepath.Ext(fileInfo.Name()) != ".wal" {
    48  			// skip, this could be disk segments, etc.
    49  			continue
    50  		}
    51  
    52  		walFileNames = append(walFileNames, fileInfo.Name())
    53  	}
    54  
    55  	// recover from each log
    56  	for _, fname := range walFileNames {
    57  		path := filepath.Join(b.dir, strings.TrimSuffix(fname, ".wal"))
    58  
    59  		cl, err := newCommitLogger(path)
    60  		if err != nil {
    61  			return errors.Wrap(err, "init commit logger")
    62  		}
    63  		defer cl.close()
    64  
    65  		cl.pause()
    66  		defer cl.unpause()
    67  
    68  		mt, err := newMemtable(path, b.strategy, b.secondaryIndices, cl, b.metrics)
    69  		if err != nil {
    70  			return err
    71  		}
    72  
    73  		b.logger.WithField("action", "lsm_recover_from_active_wal").
    74  			WithField("path", path).
    75  			Warning("active write-ahead-log found. Did weaviate crash prior to this? Trying to recover...")
    76  
    77  		meteredReader := diskio.NewMeteredReader(bufio.NewReader(cl.file), b.metrics.TrackStartupReadWALDiskIO)
    78  
    79  		err = newCommitLoggerParser(b.strategy, meteredReader, mt).Do()
    80  		if err != nil {
    81  			b.logger.WithField("action", "lsm_recover_from_active_wal_corruption").
    82  				WithField("path", filepath.Join(b.dir, fname)).
    83  				Error(errors.Wrap(err, "write-ahead-log ended abruptly, some elements may not have been recovered"))
    84  		}
    85  
    86  		if err := mt.flush(); err != nil {
    87  			return errors.Wrap(err, "flush memtable after WAL recovery")
    88  		}
    89  
    90  		if mt.Size() == 0 {
    91  			continue
    92  		}
    93  
    94  		if err := b.disk.add(path + ".db"); err != nil {
    95  			return err
    96  		}
    97  
    98  		if b.strategy == StrategyReplace && b.monitorCount {
    99  			// having just flushed the memtable we now have the most up2date count which
   100  			// is a good place to update the metric
   101  			b.metrics.ObjectCount(b.disk.count())
   102  		}
   103  
   104  		b.logger.WithField("action", "lsm_recover_from_active_wal_success").
   105  			WithField("path", filepath.Join(b.dir, fname)).
   106  			Info("successfully recovered from write-ahead-log")
   107  	}
   108  
   109  	return nil
   110  }