github.com/thanos-io/thanos@v0.32.5/pkg/verifier/index_issue.go (about) 1 // Copyright (c) The Thanos Authors. 2 // Licensed under the Apache License 2.0. 3 4 package verifier 5 6 import ( 7 "fmt" 8 "os" 9 "path" 10 "path/filepath" 11 12 "github.com/thanos-io/thanos/pkg/block/metadata" 13 14 "github.com/go-kit/log/level" 15 "github.com/oklog/ulid" 16 "github.com/pkg/errors" 17 18 "github.com/thanos-io/objstore" 19 20 "github.com/thanos-io/thanos/pkg/block" 21 ) 22 23 // IndexKnownIssues verifies any known index issue. 24 // It rewrites the problematic blocks while fixing repairable inconsistencies. 25 // If the replacement was created successfully it is uploaded to the bucket and the input 26 // block is deleted. 27 // NOTE: This also verifies all indexes against chunks mismatches and duplicates. 28 type IndexKnownIssues struct{} 29 30 func (IndexKnownIssues) IssueID() string { return "index_known_issues" } 31 32 func (IndexKnownIssues) VerifyRepair(ctx Context, idMatcher func(ulid.ULID) bool, repair bool) error { 33 level.Info(ctx.Logger).Log("msg", "started verifying issue", "with-repair", repair) 34 35 metas, _, err := ctx.Fetcher.Fetch(ctx) 36 if err != nil { 37 return err 38 } 39 40 for id, meta := range metas { 41 if idMatcher != nil && !idMatcher(id) { 42 continue 43 } 44 45 tmpdir, err := os.MkdirTemp("", fmt.Sprintf("index-issue-block-%s-", id)) 46 if err != nil { 47 return err 48 } 49 defer func() { 50 if err := os.RemoveAll(tmpdir); err != nil { 51 level.Warn(ctx.Logger).Log("msg", "failed to delete dir", "tmpdir", tmpdir, "err", err) 52 } 53 }() 54 55 stats, err := verifyIndex(ctx, id, tmpdir, meta) 56 if err == nil { 57 level.Debug(ctx.Logger).Log("msg", "no issue", "id", id) 58 continue 59 } 60 61 level.Warn(ctx.Logger).Log("msg", "detected issue", "id", id, "err", err) 62 63 if !repair { 64 // Only verify. 65 continue 66 } 67 68 if err = repairIndex(stats, ctx, id, meta, tmpdir); err != nil { 69 level.Error(ctx.Logger).Log("msg", "could not repair index", "err", err) 70 continue 71 } 72 level.Info(ctx.Logger).Log("msg", "all good, continuing", "id", id) 73 } 74 75 level.Info(ctx.Logger).Log("msg", "verified issue", "with-repair", repair) 76 return nil 77 } 78 79 func repairIndex(stats block.HealthStats, ctx Context, id ulid.ULID, meta *metadata.Meta, dir string) (err error) { 80 if stats.OutOfOrderChunks > stats.DuplicatedChunks { 81 level.Warn(ctx.Logger).Log("msg", "detected overlaps are not entirely by duplicated chunks. We are able to repair only duplicates", "id", id) 82 } 83 84 if stats.OutsideChunks > (stats.CompleteOutsideChunks + stats.Issue347OutsideChunks) { 85 level.Warn(ctx.Logger).Log("msg", "detected outsiders are not all 'complete' outsiders or outsiders from https://github.com/prometheus/tsdb/issues/347. We can safely delete only these outsiders", "id", id) 86 } 87 88 if meta.Thanos.Downsample.Resolution > 0 { 89 return errors.Wrap(err, "cannot repair downsampled blocks") 90 } 91 92 level.Info(ctx.Logger).Log("msg", "downloading block for repair", "id", id) 93 if err = block.Download(ctx, ctx.Logger, ctx.Bkt, id, path.Join(dir, id.String())); err != nil { 94 return errors.Wrapf(err, "download block %s", id) 95 } 96 level.Info(ctx.Logger).Log("msg", "downloaded block to be repaired", "id", id, "issue") 97 98 level.Info(ctx.Logger).Log("msg", "repairing block", "id", id, "issue") 99 resid, err := block.Repair( 100 ctx.Logger, 101 dir, 102 id, 103 metadata.BucketRepairSource, 104 block.IgnoreCompleteOutsideChunk, 105 block.IgnoreDuplicateOutsideChunk, 106 block.IgnoreIssue347OutsideChunk, 107 ) 108 if err != nil { 109 return errors.Wrapf(err, "repair failed for block %s", id) 110 } 111 level.Info(ctx.Logger).Log("msg", "verifying repaired block", "id", id, "newID", resid) 112 113 if err := block.VerifyIndex(ctx.Logger, filepath.Join(dir, resid.String(), block.IndexFilename), meta.MinTime, meta.MaxTime); err != nil { 114 return errors.Wrapf(err, "repaired block is invalid %s", resid) 115 } 116 117 level.Info(ctx.Logger).Log("msg", "uploading repaired block", "newID", resid) 118 if err = block.Upload(ctx, ctx.Logger, ctx.Bkt, filepath.Join(dir, resid.String()), metadata.NoneFunc); err != nil { 119 return errors.Wrapf(err, "upload of %s failed", resid) 120 } 121 122 level.Info(ctx.Logger).Log("msg", "safe deleting broken block", "id", id, "issue") 123 if err := BackupAndDeleteDownloaded(ctx, filepath.Join(dir, id.String()), id); err != nil { 124 return errors.Wrapf(err, "safe deleting old block %s failed", id) 125 } 126 127 return nil 128 } 129 130 func verifyIndex(ctx Context, id ulid.ULID, dir string, meta *metadata.Meta) (stats block.HealthStats, err error) { 131 if err := objstore.DownloadFile(ctx, ctx.Logger, ctx.Bkt, path.Join(id.String(), block.IndexFilename), filepath.Join(dir, block.IndexFilename)); err != nil { 132 return stats, errors.Wrapf(err, "download index file %s", path.Join(id.String(), block.IndexFilename)) 133 } 134 135 stats, err = block.GatherIndexHealthStats(ctx.Logger, filepath.Join(dir, block.IndexFilename), meta.MinTime, meta.MaxTime) 136 if err != nil { 137 return stats, errors.Wrapf(err, "gather index issues %s", id) 138 } 139 140 level.Debug(ctx.Logger).Log("stats", fmt.Sprintf("%+v", stats), "id", id) 141 142 return stats, stats.AnyErr() 143 }