github.com/thanos-io/thanos@v0.32.5/pkg/verifier/index_issue.go (about)

     1  // Copyright (c) The Thanos Authors.
     2  // Licensed under the Apache License 2.0.
     3  
     4  package verifier
     5  
     6  import (
     7  	"fmt"
     8  	"os"
     9  	"path"
    10  	"path/filepath"
    11  
    12  	"github.com/thanos-io/thanos/pkg/block/metadata"
    13  
    14  	"github.com/go-kit/log/level"
    15  	"github.com/oklog/ulid"
    16  	"github.com/pkg/errors"
    17  
    18  	"github.com/thanos-io/objstore"
    19  
    20  	"github.com/thanos-io/thanos/pkg/block"
    21  )
    22  
    23  // IndexKnownIssues verifies any known index issue.
    24  // It rewrites the problematic blocks while fixing repairable inconsistencies.
    25  // If the replacement was created successfully it is uploaded to the bucket and the input
    26  // block is deleted.
    27  // NOTE: This also verifies all indexes against chunks mismatches and duplicates.
    28  type IndexKnownIssues struct{}
    29  
    30  func (IndexKnownIssues) IssueID() string { return "index_known_issues" }
    31  
    32  func (IndexKnownIssues) VerifyRepair(ctx Context, idMatcher func(ulid.ULID) bool, repair bool) error {
    33  	level.Info(ctx.Logger).Log("msg", "started verifying issue", "with-repair", repair)
    34  
    35  	metas, _, err := ctx.Fetcher.Fetch(ctx)
    36  	if err != nil {
    37  		return err
    38  	}
    39  
    40  	for id, meta := range metas {
    41  		if idMatcher != nil && !idMatcher(id) {
    42  			continue
    43  		}
    44  
    45  		tmpdir, err := os.MkdirTemp("", fmt.Sprintf("index-issue-block-%s-", id))
    46  		if err != nil {
    47  			return err
    48  		}
    49  		defer func() {
    50  			if err := os.RemoveAll(tmpdir); err != nil {
    51  				level.Warn(ctx.Logger).Log("msg", "failed to delete dir", "tmpdir", tmpdir, "err", err)
    52  			}
    53  		}()
    54  
    55  		stats, err := verifyIndex(ctx, id, tmpdir, meta)
    56  		if err == nil {
    57  			level.Debug(ctx.Logger).Log("msg", "no issue", "id", id)
    58  			continue
    59  		}
    60  
    61  		level.Warn(ctx.Logger).Log("msg", "detected issue", "id", id, "err", err)
    62  
    63  		if !repair {
    64  			// Only verify.
    65  			continue
    66  		}
    67  
    68  		if err = repairIndex(stats, ctx, id, meta, tmpdir); err != nil {
    69  			level.Error(ctx.Logger).Log("msg", "could not repair index", "err", err)
    70  			continue
    71  		}
    72  		level.Info(ctx.Logger).Log("msg", "all good, continuing", "id", id)
    73  	}
    74  
    75  	level.Info(ctx.Logger).Log("msg", "verified issue", "with-repair", repair)
    76  	return nil
    77  }
    78  
    79  func repairIndex(stats block.HealthStats, ctx Context, id ulid.ULID, meta *metadata.Meta, dir string) (err error) {
    80  	if stats.OutOfOrderChunks > stats.DuplicatedChunks {
    81  		level.Warn(ctx.Logger).Log("msg", "detected overlaps are not entirely by duplicated chunks. We are able to repair only duplicates", "id", id)
    82  	}
    83  
    84  	if stats.OutsideChunks > (stats.CompleteOutsideChunks + stats.Issue347OutsideChunks) {
    85  		level.Warn(ctx.Logger).Log("msg", "detected outsiders are not all 'complete' outsiders or outsiders from https://github.com/prometheus/tsdb/issues/347. We can safely delete only these outsiders", "id", id)
    86  	}
    87  
    88  	if meta.Thanos.Downsample.Resolution > 0 {
    89  		return errors.Wrap(err, "cannot repair downsampled blocks")
    90  	}
    91  
    92  	level.Info(ctx.Logger).Log("msg", "downloading block for repair", "id", id)
    93  	if err = block.Download(ctx, ctx.Logger, ctx.Bkt, id, path.Join(dir, id.String())); err != nil {
    94  		return errors.Wrapf(err, "download block %s", id)
    95  	}
    96  	level.Info(ctx.Logger).Log("msg", "downloaded block to be repaired", "id", id, "issue")
    97  
    98  	level.Info(ctx.Logger).Log("msg", "repairing block", "id", id, "issue")
    99  	resid, err := block.Repair(
   100  		ctx.Logger,
   101  		dir,
   102  		id,
   103  		metadata.BucketRepairSource,
   104  		block.IgnoreCompleteOutsideChunk,
   105  		block.IgnoreDuplicateOutsideChunk,
   106  		block.IgnoreIssue347OutsideChunk,
   107  	)
   108  	if err != nil {
   109  		return errors.Wrapf(err, "repair failed for block %s", id)
   110  	}
   111  	level.Info(ctx.Logger).Log("msg", "verifying repaired block", "id", id, "newID", resid)
   112  
   113  	if err := block.VerifyIndex(ctx.Logger, filepath.Join(dir, resid.String(), block.IndexFilename), meta.MinTime, meta.MaxTime); err != nil {
   114  		return errors.Wrapf(err, "repaired block is invalid %s", resid)
   115  	}
   116  
   117  	level.Info(ctx.Logger).Log("msg", "uploading repaired block", "newID", resid)
   118  	if err = block.Upload(ctx, ctx.Logger, ctx.Bkt, filepath.Join(dir, resid.String()), metadata.NoneFunc); err != nil {
   119  		return errors.Wrapf(err, "upload of %s failed", resid)
   120  	}
   121  
   122  	level.Info(ctx.Logger).Log("msg", "safe deleting broken block", "id", id, "issue")
   123  	if err := BackupAndDeleteDownloaded(ctx, filepath.Join(dir, id.String()), id); err != nil {
   124  		return errors.Wrapf(err, "safe deleting old block %s failed", id)
   125  	}
   126  
   127  	return nil
   128  }
   129  
   130  func verifyIndex(ctx Context, id ulid.ULID, dir string, meta *metadata.Meta) (stats block.HealthStats, err error) {
   131  	if err := objstore.DownloadFile(ctx, ctx.Logger, ctx.Bkt, path.Join(id.String(), block.IndexFilename), filepath.Join(dir, block.IndexFilename)); err != nil {
   132  		return stats, errors.Wrapf(err, "download index file %s", path.Join(id.String(), block.IndexFilename))
   133  	}
   134  
   135  	stats, err = block.GatherIndexHealthStats(ctx.Logger, filepath.Join(dir, block.IndexFilename), meta.MinTime, meta.MaxTime)
   136  	if err != nil {
   137  		return stats, errors.Wrapf(err, "gather index issues %s", id)
   138  	}
   139  
   140  	level.Debug(ctx.Logger).Log("stats", fmt.Sprintf("%+v", stats), "id", id)
   141  
   142  	return stats, stats.AnyErr()
   143  }